The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
df5b035b5683 ("x86/cacheinfo: Add a cpu_llc_shared_mask() UP variant")
66558b730f25 ("sched: Add cluster scheduler level for x86")
9164d9493a79 ("x86/cpu: Add get_llc_id() helper function")
2c88d45edbb8 ("x86, sched: Treat Intel SNC topology as default, COD as exception")
adefe55e7258 ("x86/kernel: Convert to new CPU match macros")
45fc24e89b7c ("x86/mpx: remove MPX from arch/x86")
0cc5359d8fd4 ("x86/cpu: Update init data for new Airmont CPU model")
9326011edfcb ("Merge branch 'x86/cleanups' into x86/cpu, to pick up dependent changes")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From df5b035b5683d6a25f077af889fb88e09827f8bc Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp(a)suse.de>
Date: Fri, 19 Aug 2022 19:47:44 +0200
Subject: [PATCH] x86/cacheinfo: Add a cpu_llc_shared_mask() UP variant
On a CONFIG_SMP=n kernel, the LLC shared mask is 0, which prevents
__cache_amd_cpumap_setup() from doing the L3 masks setup, and more
specifically from setting up the shared_cpu_map and shared_cpu_list
files in sysfs, leading to lscpu from util-linux getting confused and
segfaulting.
Add a cpu_llc_shared_mask() UP variant which returns a mask with a
single bit set, i.e., for CPU0.
Fixes: 2b83809a5e6d ("x86/cpu/amd: Derive L3 shared_cpu_map from cpu_llc_shared_mask")
Reported-by: Saurabh Sengar <ssengar(a)linux.microsoft.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/1660148115-302-1-git-send-email-ssengar@linux.mic…
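For whoever picks up the 4.19/5.4 backports: the functional core is only the
!CONFIG_SMP fallback. A minimal sketch of how it could sit in an older
asm/smp.h, assuming those trees lack the cpu_l2c_* helpers that the upstream
diff also moves (untested, for orientation only):

/* arch/x86/include/asm/smp.h -- hedged backport sketch, not a tested patch */
#ifdef CONFIG_SMP
static inline struct cpumask *cpu_llc_shared_mask(int cpu)
{
	return per_cpu(cpu_llc_shared_map, cpu);
}
#else /* !CONFIG_SMP */
/*
 * On UP the last-level cache is trivially "shared" by the only CPU, so
 * return a mask with just bit 0 set; cpumask_of(0) provides exactly that.
 * This lets __cache_amd_cpumap_setup() populate shared_cpu_map and
 * shared_cpu_list in sysfs, which keeps lscpu from segfaulting.
 */
static inline struct cpumask *cpu_llc_shared_mask(int cpu)
{
	return (struct cpumask *)cpumask_of(0);
}
#endif /* CONFIG_SMP */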
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 81a0211a372d..a73bced40e24 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -21,16 +21,6 @@ DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
-static inline struct cpumask *cpu_llc_shared_mask(int cpu)
-{
- return per_cpu(cpu_llc_shared_map, cpu);
-}
-
-static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
-{
- return per_cpu(cpu_l2c_shared_map, cpu);
-}
-
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
@@ -172,6 +162,16 @@ extern int safe_smp_processor_id(void);
# define safe_smp_processor_id() smp_processor_id()
#endif
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+ return per_cpu(cpu_llc_shared_map, cpu);
+}
+
+static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
+{
+ return per_cpu(cpu_l2c_shared_map, cpu);
+}
+
#else /* !CONFIG_SMP */
#define wbinvd_on_cpu(cpu) wbinvd()
static inline int wbinvd_on_all_cpus(void)
@@ -179,6 +179,11 @@ static inline int wbinvd_on_all_cpus(void)
wbinvd();
return 0;
}
+
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+ return (struct cpumask *)cpumask_of(0);
+}
#endif /* CONFIG_SMP */
extern unsigned disabled_cpus;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
df5b035b5683 ("x86/cacheinfo: Add a cpu_llc_shared_mask() UP variant")
66558b730f25 ("sched: Add cluster scheduler level for x86")
9164d9493a79 ("x86/cpu: Add get_llc_id() helper function")
2c88d45edbb8 ("x86, sched: Treat Intel SNC topology as default, COD as exception")
adefe55e7258 ("x86/kernel: Convert to new CPU match macros")
45fc24e89b7c ("x86/mpx: remove MPX from arch/x86")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From df5b035b5683d6a25f077af889fb88e09827f8bc Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp(a)suse.de>
Date: Fri, 19 Aug 2022 19:47:44 +0200
Subject: [PATCH] x86/cacheinfo: Add a cpu_llc_shared_mask() UP variant
On a CONFIG_SMP=n kernel, the LLC shared mask is 0, which prevents
__cache_amd_cpumap_setup() from doing the L3 masks setup, and more
specifically from setting up the shared_cpu_map and shared_cpu_list
files in sysfs, leading to lscpu from util-linux getting confused and
segfaulting.
Add a cpu_llc_shared_mask() UP variant which returns a mask with a
single bit set, i.e., for CPU0.
Fixes: 2b83809a5e6d ("x86/cpu/amd: Derive L3 shared_cpu_map from cpu_llc_shared_mask")
Reported-by: Saurabh Sengar <ssengar(a)linux.microsoft.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/1660148115-302-1-git-send-email-ssengar@linux.mic…
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 81a0211a372d..a73bced40e24 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -21,16 +21,6 @@ DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
-static inline struct cpumask *cpu_llc_shared_mask(int cpu)
-{
- return per_cpu(cpu_llc_shared_map, cpu);
-}
-
-static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
-{
- return per_cpu(cpu_l2c_shared_map, cpu);
-}
-
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
@@ -172,6 +162,16 @@ extern int safe_smp_processor_id(void);
# define safe_smp_processor_id() smp_processor_id()
#endif
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+ return per_cpu(cpu_llc_shared_map, cpu);
+}
+
+static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
+{
+ return per_cpu(cpu_l2c_shared_map, cpu);
+}
+
#else /* !CONFIG_SMP */
#define wbinvd_on_cpu(cpu) wbinvd()
static inline int wbinvd_on_all_cpus(void)
@@ -179,6 +179,11 @@ static inline int wbinvd_on_all_cpus(void)
wbinvd();
return 0;
}
+
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+ return (struct cpumask *)cpumask_of(0);
+}
#endif /* CONFIG_SMP */
extern unsigned disabled_cpus;
Hello,
This patch needs to be applied to linux-5.4.y and linux-4.19.y.
In addition to that, the following two patches are somewhat related:
3334a45 mm/page_alloc: use ac->high_zoneidx for classzone_idx
9282012 page_alloc: fix invalid watermark check on a negative value
thanks.
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
37238699073e ("media: dvb_vb2: fix possible out of bound access")
fd89e0bb6ebf ("media: videobuf2-core: integrate with media requests")
55028695c3bb ("media: vb2: drop VB2_BUF_STATE_PREPARED, use bool prepared/synced instead")
db6e8d57e2cd ("media: vb2: store userspace data in vb2_v4l2_buffer")
0af4e80bf24a ("media: videobuf2-v4l2: replace if by switch in __fill_vb2_buffer()")
5f89ec80f1e0 ("media: videobuf2-v4l2: move __fill_v4l2_buffer() function")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 37238699073e7e93f05517e529661151173cd458 Mon Sep 17 00:00:00 2001
From: Hangyu Hua <hbh25y(a)gmail.com>
Date: Thu, 19 May 2022 03:17:43 +0100
Subject: [PATCH] media: dvb_vb2: fix possible out of bound access
vb2_core_qbuf and vb2_core_querybuf don't check the range of b->index,
which is controlled by the user.
Fix this by adding range-checking code before using it.
Fixes: 57868acc369a ("media: videobuf2: Add new uAPI for DVB streaming I/O")
Signed-off-by: Hangyu Hua <hbh25y(a)gmail.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky(a)chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)kernel.org>
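The bug class here is a plain unvalidated array index coming from userspace.
A self-contained userspace model of the check the patch adds (hypothetical
names throughout; only the >= num_buffers comparison mirrors the kernel code):

#include <stdio.h>

#define NUM_BUFFERS 8			/* stands in for q->num_buffers */
static int buffers[NUM_BUFFERS];

/* stands in for dvb_vb2_querybuf(); index arrives from userspace */
static int querybuf(unsigned int index)
{
	if (index >= NUM_BUFFERS)	/* the range check the patch adds */
		return -1;		/* the kernel returns -EINVAL */
	return buffers[index];
}

int main(void)
{
	printf("in range: %d\n", querybuf(3));	    /* accepted */
	printf("out of range: %d\n", querybuf(42)); /* rejected, no OOB read */
	return 0;
}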
diff --git a/drivers/media/dvb-core/dvb_vb2.c b/drivers/media/dvb-core/dvb_vb2.c
index a1bd6d9c9223..909df82fed33 100644
--- a/drivers/media/dvb-core/dvb_vb2.c
+++ b/drivers/media/dvb-core/dvb_vb2.c
@@ -354,6 +354,12 @@ int dvb_vb2_reqbufs(struct dvb_vb2_ctx *ctx, struct dmx_requestbuffers *req)
int dvb_vb2_querybuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b)
{
+ struct vb2_queue *q = &ctx->vb_q;
+
+ if (b->index >= q->num_buffers) {
+ dprintk(1, "[%s] buffer index out of range\n", ctx->name);
+ return -EINVAL;
+ }
vb2_core_querybuf(&ctx->vb_q, b->index, b);
dprintk(3, "[%s] index=%d\n", ctx->name, b->index);
return 0;
@@ -378,8 +384,13 @@ int dvb_vb2_expbuf(struct dvb_vb2_ctx *ctx, struct dmx_exportbuffer *exp)
int dvb_vb2_qbuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b)
{
+ struct vb2_queue *q = &ctx->vb_q;
int ret;
+ if (b->index >= q->num_buffers) {
+ dprintk(1, "[%s] buffer index out of range\n", ctx->name);
+ return -EINVAL;
+ }
ret = vb2_core_qbuf(&ctx->vb_q, b->index, b, NULL);
if (ret) {
dprintk(1, "[%s] index=%d errno=%d\n", ctx->name,
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
2b7aa91ba0e8 ("mm/huge_memory: use pfn_to_online_page() in split_huge_pages_all()")
a17206dac7b2 ("mm/huge_memory: minor cleanup for split_huge_pages_all")
fa6c02315f74 ("mm: huge_memory: a new debugfs interface for splitting THP tests")
f3a45709d2bb ("selftests/vm: hmm-tests: remove the libhugetlbfs dependency")
f545605cc08e ("selftests/vm: minor cleanup: Makefile and gup_test.c")
b9dcfdff8b4b ("selftests/vm: use a common gup_test.h")
9c84f229268f ("mm/gup_benchmark: rename to mm/gup_test")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b7aa91ba0e86b8643f5d3c83874c80599c731d7 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Date: Thu, 8 Sep 2022 13:11:50 +0900
Subject: [PATCH] mm/huge_memory: use pfn_to_online_page() in
split_huge_pages_all()
NULL pointer dereference is triggered when calling thp split via debugfs
on the system with offlined memory blocks. With debug option enabled, the
following kernel messages are printed out:
page:00000000467f4890 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x121c000
flags: 0x17fffc00000000(node=0|zone=2|lastcpupid=0x1ffff)
raw: 0017fffc00000000 0000000000000000 dead000000000122 0000000000000000
raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
page dumped because: unmovable page
page:000000007d7ab72e is uninitialized and poisoned
page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
------------[ cut here ]------------
kernel BUG at include/linux/mm.h:1248!
invalid opcode: 0000 [#1] PREEMPT SMP PTI
CPU: 16 PID: 20964 Comm: bash Tainted: G I 6.0.0-rc3-foll-numa+ #41
...
RIP: 0010:split_huge_pages_write+0xcf4/0xe30
This shows that page_to_nid() in page_zone() is unexpectedly called for an
offlined memmap.
Use pfn_to_online_page() to get struct page in PFN walker.
Link: https://lkml.kernel.org/r/20220908041150.3430269-1-naoya.horiguchi@linux.dev
Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") [visible after d0dc12e86b319]
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Co-developed-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: Miaohe Lin <linmiaohe(a)huawei.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org> [5.10+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
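The same rule applies to any PFN walker. A hedged sketch of the safe loop
shape, following the patched split_huge_pages_all() (a fragment, not a
drop-in function):

for (pfn = zone->zone_start_pfn; pfn < zone_end_pfn(zone); pfn++) {
	struct page *page;

	/*
	 * pfn_valid() only guarantees that a memmap entry exists; with
	 * memory hotplug the backing section may be offline and its
	 * struct page uninitialized (poisoned). pfn_to_online_page()
	 * returns NULL in that case, so page_zone()/page_to_nid() are
	 * never called on a poisoned page.
	 */
	page = pfn_to_online_page(pfn);
	if (!page || !get_page_unless_zero(page))
		continue;

	/* ... operate on the pinned page ... */

	put_page(page);
}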
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e9414ee57c5b..f42bb51e023a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2894,11 +2894,9 @@ static void split_huge_pages_all(void)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
int nr_pages;
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- if (!get_page_unless_zero(page))
+ page = pfn_to_online_page(pfn);
+ if (!page || !get_page_unless_zero(page))
continue;
if (zone != page_zone(page))
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
2b7aa91ba0e8 ("mm/huge_memory: use pfn_to_online_page() in split_huge_pages_all()")
a17206dac7b2 ("mm/huge_memory: minor cleanup for split_huge_pages_all")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b7aa91ba0e86b8643f5d3c83874c80599c731d7 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Date: Thu, 8 Sep 2022 13:11:50 +0900
Subject: [PATCH] mm/huge_memory: use pfn_to_online_page() in
split_huge_pages_all()
NULL pointer dereference is triggered when calling thp split via debugfs
on the system with offlined memory blocks. With debug option enabled, the
following kernel messages are printed out:
page:00000000467f4890 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x121c000
flags: 0x17fffc00000000(node=0|zone=2|lastcpupid=0x1ffff)
raw: 0017fffc00000000 0000000000000000 dead000000000122 0000000000000000
raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
page dumped because: unmovable page
page:000000007d7ab72e is uninitialized and poisoned
page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
------------[ cut here ]------------
kernel BUG at include/linux/mm.h:1248!
invalid opcode: 0000 [#1] PREEMPT SMP PTI
CPU: 16 PID: 20964 Comm: bash Tainted: G I 6.0.0-rc3-foll-numa+ #41
...
RIP: 0010:split_huge_pages_write+0xcf4/0xe30
This shows that page_to_nid() in page_zone() is unexpectedly called for an
offlined memmap.
Use pfn_to_online_page() to get struct page in PFN walker.
Link: https://lkml.kernel.org/r/20220908041150.3430269-1-naoya.horiguchi@linux.dev
Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") [visible after d0dc12e86b319]
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Co-developed-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: Miaohe Lin <linmiaohe(a)huawei.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org> [5.10+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e9414ee57c5b..f42bb51e023a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2894,11 +2894,9 @@ static void split_huge_pages_all(void)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
int nr_pages;
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- if (!get_page_unless_zero(page))
+ page = pfn_to_online_page(pfn);
+ if (!page || !get_page_unless_zero(page))
continue;
if (zone != page_zone(page))
The patch below does not apply to the 5.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
2b7aa91ba0e8 ("mm/huge_memory: use pfn_to_online_page() in split_huge_pages_all()")
a17206dac7b2 ("mm/huge_memory: minor cleanup for split_huge_pages_all")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b7aa91ba0e86b8643f5d3c83874c80599c731d7 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Date: Thu, 8 Sep 2022 13:11:50 +0900
Subject: [PATCH] mm/huge_memory: use pfn_to_online_page() in
split_huge_pages_all()
NULL pointer dereference is triggered when calling thp split via debugfs
on the system with offlined memory blocks. With debug option enabled, the
following kernel messages are printed out:
page:00000000467f4890 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x121c000
flags: 0x17fffc00000000(node=0|zone=2|lastcpupid=0x1ffff)
raw: 0017fffc00000000 0000000000000000 dead000000000122 0000000000000000
raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
page dumped because: unmovable page
page:000000007d7ab72e is uninitialized and poisoned
page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
------------[ cut here ]------------
kernel BUG at include/linux/mm.h:1248!
invalid opcode: 0000 [#1] PREEMPT SMP PTI
CPU: 16 PID: 20964 Comm: bash Tainted: G I 6.0.0-rc3-foll-numa+ #41
...
RIP: 0010:split_huge_pages_write+0xcf4/0xe30
This shows that page_to_nid() in page_zone() is unexpectedly called for an
offlined memmap.
Use pfn_to_online_page() to get struct page in PFN walker.
Link: https://lkml.kernel.org/r/20220908041150.3430269-1-naoya.horiguchi@linux.dev
Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") [visible after d0dc12e86b319]
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Co-developed-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: Miaohe Lin <linmiaohe(a)huawei.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org> [5.10+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e9414ee57c5b..f42bb51e023a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2894,11 +2894,9 @@ static void split_huge_pages_all(void)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
int nr_pages;
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- if (!get_page_unless_zero(page))
+ page = pfn_to_online_page(pfn);
+ if (!page || !get_page_unless_zero(page))
continue;
if (zone != page_zone(page))
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
70427f6e9ecf ("mm: bring back update_mmu_cache() to finish_fault()")
f46f2adecdcc ("mm: check against orig_pte for finish_fault()")
c89357e27f20 ("mm: support GUP-triggered unsharing of anonymous pages")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
9030fb0bb9d6 ("Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 70427f6e9ecfc8c5f977b21dd9f846b3bda02500 Mon Sep 17 00:00:00 2001
From: Sergei Antonov <saproj(a)gmail.com>
Date: Thu, 8 Sep 2022 23:48:09 +0300
Subject: [PATCH] mm: bring back update_mmu_cache() to finish_fault()
Running this test program on ARMv4 a few times (sometimes just once)
reproduces the bug.
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define SIZE 4096 /* SIZE is undefined in the quoted program; a page-sized value is assumed here */

int main(void)
{
	unsigned i;
	char paragon[SIZE];
	void *ptr;

	memset(paragon, 0xAA, SIZE);
	ptr = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
		   MAP_ANON | MAP_SHARED, -1, 0);
	if (ptr == MAP_FAILED)
		return 1;
	printf("ptr = %p\n", ptr);
	for (i = 0; i < 10000; i++) {
		memset(ptr, 0xAA, SIZE);
		if (memcmp(ptr, paragon, SIZE)) {
			printf("Unexpected bytes on iteration %u!!!\n", i);
			break;
		}
	}
	munmap(ptr, SIZE);
	return 0;
}
In the "ptr" buffer there appear runs of zero bytes which are aligned
by 16 and their lengths are multiple of 16.
Linux v5.11 does not have the bug, "git bisect" finds the first bad commit:
f9ce0be71d1f ("mm: Cleanup faultaround and finish_fault() codepaths")
Before the commit update_mmu_cache() was called during a call to
filemap_map_pages() as well as finish_fault(). After the commit
finish_fault() lacks it.
Bring back update_mmu_cache() to finish_fault() to fix the bug.
Also call update_mmu_tlb() only when returning VM_FAULT_NOPAGE to more
closely reproduce the code of alloc_set_pte() function that existed before
the commit.
On many platforms update_mmu_cache() is nop:
x86, see arch/x86/include/asm/pgtable
ARMv6+, see arch/arm/include/asm/tlbflush.h
So, it seems, few users ran into this bug.
Link: https://lkml.kernel.org/r/20220908204809.2012451-1-saproj@gmail.com
Fixes: f9ce0be71d1f ("mm: Cleanup faultaround and finish_fault() codepaths")
Signed-off-by: Sergei Antonov <saproj(a)gmail.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
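Condensed, the control flow the patch restores looks like this (an annotated
sketch of the finish_fault() tail; the real change is the diff below):

/* re-check under the PTE lock */
if (likely(!vmf_pte_changed(vmf))) {
	do_set_pte(vmf, page, vmf->address);
	/*
	 * No invalidation is needed -- a not-present page cannot have
	 * been cached -- but architectures with a non-nop
	 * update_mmu_cache() (e.g. ARMv4) must prime the MMU cache here,
	 * which is the step the bisected commit dropped.
	 */
	update_mmu_cache(vma, vmf->address, vmf->pte);
	ret = 0;
} else {
	/* the PTE changed under us: only flush the stale translation */
	update_mmu_tlb(vma, vmf->address, vmf->pte);
	ret = VM_FAULT_NOPAGE;
}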
diff --git a/mm/memory.c b/mm/memory.c
index 4ba73f5aa8bb..a78814413ac0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4386,14 +4386,20 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- ret = 0;
+
/* Re-check under ptl */
- if (likely(!vmf_pte_changed(vmf)))
+ if (likely(!vmf_pte_changed(vmf))) {
do_set_pte(vmf, page, vmf->address);
- else
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+
+ ret = 0;
+ } else {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
ret = VM_FAULT_NOPAGE;
+ }
- update_mmu_tlb(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
fd35ca3d12cc ("mm/migrate_device.c: copy pte dirty bit to page")
a3589e1d5fe3 ("mm/migrate_device.c: add missing flush_cache_page()")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
9030fb0bb9d6 ("Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fd35ca3d12cc9922d7d9a35f934e72132dbc4853 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 2 Sep 2022 10:35:53 +1000
Subject: [PATCH] mm/migrate_device.c: copy pte dirty bit to page
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
installs migration entries directly if it can lock the migrating page.
When removing a dirty pte the dirty bit is supposed to be carried over to
the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous mappings.
That means loss of the dirty bit usually doesn't result in data loss
because these pages are typically not file-backed. However pages may be
backed by swap storage which can result in data loss if an attempt is made
to migrate a dirty page that doesn't yet have the PageDirty flag set.
In this case migration will fail due to unexpected references but the
dirty pte bit will be lost. If the page is subsequently reclaimed data
won't be written back to swap storage as it is considered uptodate,
resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte to
match what try_to_migrate_one() does.
Link: https://lkml.kernel.org/r/dd48e4882ce859c295c1a77612f66d198b0403f9.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
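The essence of the fix, pulled out of the diff below (sketch only; the
anon-exclusive error path is omitted):

/* Clearing a PTE returns its old value; keep it instead of dropping it. */
if (anon_exclusive)
	pte = ptep_clear_flush(vma, addr, ptep);
else
	pte = ptep_get_and_clear(mm, addr, ptep);

/*
 * Carry the hardware dirty bit over to the page before the migration
 * entry is installed, mirroring try_to_migrate_one(). Otherwise reclaim
 * can treat a dirty-but-unflagged page as uptodate and skip writeback
 * to swap, losing the data.
 */
if (pte_dirty(pte))
	folio_mark_dirty(page_folio(page));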
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 4cc849c3b54c..dbf6c7a7a7c9 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -196,7 +197,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +207,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
goto next;
}
} else {
- ptep_get_and_clear(mm, addr, ptep);
+ pte = ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
fd35ca3d12cc ("mm/migrate_device.c: copy pte dirty bit to page")
a3589e1d5fe3 ("mm/migrate_device.c: add missing flush_cache_page()")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
9030fb0bb9d6 ("Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fd35ca3d12cc9922d7d9a35f934e72132dbc4853 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 2 Sep 2022 10:35:53 +1000
Subject: [PATCH] mm/migrate_device.c: copy pte dirty bit to page
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
installs migration entries directly if it can lock the migrating page.
When removing a dirty pte the dirty bit is supposed to be carried over to
the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous mappings.
That means loss of the dirty bit usually doesn't result in data loss
because these pages are typically not file-backed. However pages may be
backed by swap storage which can result in data loss if an attempt is made
to migrate a dirty page that doesn't yet have the PageDirty flag set.
In this case migration will fail due to unexpected references but the
dirty pte bit will be lost. If the page is subsequently reclaimed data
won't be written back to swap storage as it is considered uptodate,
resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte to
match what try_to_migrate_one() does.
Link: https://lkml.kernel.org/r/dd48e4882ce859c295c1a77612f66d198b0403f9.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 4cc849c3b54c..dbf6c7a7a7c9 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -196,7 +197,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +207,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
goto next;
}
} else {
- ptep_get_and_clear(mm, addr, ptep);
+ pte = ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
fd35ca3d12cc ("mm/migrate_device.c: copy pte dirty bit to page")
a3589e1d5fe3 ("mm/migrate_device.c: add missing flush_cache_page()")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
9030fb0bb9d6 ("Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fd35ca3d12cc9922d7d9a35f934e72132dbc4853 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 2 Sep 2022 10:35:53 +1000
Subject: [PATCH] mm/migrate_device.c: copy pte dirty bit to page
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
installs migration entries directly if it can lock the migrating page.
When removing a dirty pte the dirty bit is supposed to be carried over to
the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous mappings.
That means loss of the dirty bit usually doesn't result in data loss
because these pages are typically not file-backed. However pages may be
backed by swap storage which can result in data loss if an attempt is made
to migrate a dirty page that doesn't yet have the PageDirty flag set.
In this case migration will fail due to unexpected references but the
dirty pte bit will be lost. If the page is subsequently reclaimed data
won't be written back to swap storage as it is considered uptodate,
resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte to
match what try_to_migrate_one() does.
Link: https://lkml.kernel.org/r/dd48e4882ce859c295c1a77612f66d198b0403f9.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 4cc849c3b54c..dbf6c7a7a7c9 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -196,7 +197,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +207,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
goto next;
}
} else {
- ptep_get_and_clear(mm, addr, ptep);
+ pte = ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
fd35ca3d12cc ("mm/migrate_device.c: copy pte dirty bit to page")
a3589e1d5fe3 ("mm/migrate_device.c: add missing flush_cache_page()")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
9030fb0bb9d6 ("Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fd35ca3d12cc9922d7d9a35f934e72132dbc4853 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 2 Sep 2022 10:35:53 +1000
Subject: [PATCH] mm/migrate_device.c: copy pte dirty bit to page
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
installs migration entries directly if it can lock the migrating page.
When removing a dirty pte the dirty bit is supposed to be carried over to
the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous mappings.
That means loss of the dirty bit usually doesn't result in data loss
because these pages are typically not file-backed. However pages may be
backed by swap storage which can result in data loss if an attempt is made
to migrate a dirty page that doesn't yet have the PageDirty flag set.
In this case migration will fail due to unexpected references but the
dirty pte bit will be lost. If the page is subsequently reclaimed data
won't be written back to swap storage as it is considered uptodate,
resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte to
match what try_to_migrate_one() does.
Link: https://lkml.kernel.org/r/dd48e4882ce859c295c1a77612f66d198b0403f9.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 4cc849c3b54c..dbf6c7a7a7c9 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -196,7 +197,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +207,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
goto next;
}
} else {
- ptep_get_and_clear(mm, addr, ptep);
+ pte = ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
fd35ca3d12cc ("mm/migrate_device.c: copy pte dirty bit to page")
a3589e1d5fe3 ("mm/migrate_device.c: add missing flush_cache_page()")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
9030fb0bb9d6 ("Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fd35ca3d12cc9922d7d9a35f934e72132dbc4853 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 2 Sep 2022 10:35:53 +1000
Subject: [PATCH] mm/migrate_device.c: copy pte dirty bit to page
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
installs migration entries directly if it can lock the migrating page.
When removing a dirty pte the dirty bit is supposed to be carried over to
the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous mappings.
That means loss of the dirty bit usually doesn't result in data loss
because these pages are typically not file-backed. However pages may be
backed by swap storage which can result in data loss if an attempt is made
to migrate a dirty page that doesn't yet have the PageDirty flag set.
In this case migration will fail due to unexpected references but the
dirty pte bit will be lost. If the page is subsequently reclaimed data
won't be written back to swap storage as it is considered uptodate,
resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte to
match what try_to_migrate_one() does.
Link: https://lkml.kernel.org/r/dd48e4882ce859c295c1a77612f66d198b0403f9.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 4cc849c3b54c..dbf6c7a7a7c9 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -196,7 +197,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +207,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
goto next;
}
} else {
- ptep_get_and_clear(mm, addr, ptep);
+ pte = ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
a3589e1d5fe3 ("mm/migrate_device.c: add missing flush_cache_page()")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
9030fb0bb9d6 ("Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a3589e1d5fe39c3d9fdd291b111524b93d08bc32 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 2 Sep 2022 10:35:52 +1000
Subject: [PATCH] mm/migrate_device.c: add missing flush_cache_page()
Currently we only call flush_cache_page() for the anon_exclusive case;
however, in both cases we clear the pte and so should flush the cache.
Link: https://lkml.kernel.org/r/5676f30436ab71d1a587ac73f835ed8bd2113ff5.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
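The one-line move matters because both branches clear the PTE. As a sketch,
the ordering the patch establishes:

/*
 * Flush the cache for the page before the PTE goes away -- in both
 * branches, not only the anon-exclusive one.
 */
flush_cache_page(vma, addr, pte_pfn(*ptep));

anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive)
	ptep_clear_flush(vma, addr, ptep);	/* also flushes the TLB */
else
	ptep_get_and_clear(mm, addr, ptep);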
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6a5ef9f0da6a..4cc849c3b54c 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -193,9 +193,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
a3589e1d5fe3 ("mm/migrate_device.c: add missing flush_cache_page()")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
9030fb0bb9d6 ("Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a3589e1d5fe39c3d9fdd291b111524b93d08bc32 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 2 Sep 2022 10:35:52 +1000
Subject: [PATCH] mm/migrate_device.c: add missing flush_cache_page()
Currently we only call flush_cache_page() for the anon_exclusive case;
however, in both cases we clear the pte and so should flush the cache.
Link: https://lkml.kernel.org/r/5676f30436ab71d1a587ac73f835ed8bd2113ff5.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6a5ef9f0da6a..4cc849c3b54c 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -193,9 +193,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
a3589e1d5fe3 ("mm/migrate_device.c: add missing flush_cache_page()")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
9030fb0bb9d6 ("Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a3589e1d5fe39c3d9fdd291b111524b93d08bc32 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 2 Sep 2022 10:35:52 +1000
Subject: [PATCH] mm/migrate_device.c: add missing flush_cache_page()
Currently we only call flush_cache_page() for the anon_exclusive case;
however, in both cases we clear the pte and so should flush the cache.
Link: https://lkml.kernel.org/r/5676f30436ab71d1a587ac73f835ed8bd2113ff5.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6a5ef9f0da6a..4cc849c3b54c 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -193,9 +193,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
a3589e1d5fe3 ("mm/migrate_device.c: add missing flush_cache_page()")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
9030fb0bb9d6 ("Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a3589e1d5fe39c3d9fdd291b111524b93d08bc32 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 2 Sep 2022 10:35:52 +1000
Subject: [PATCH] mm/migrate_device.c: add missing flush_cache_page()
Currently we only call flush_cache_page() for the anon_exclusive case;
however, in both cases we clear the pte, so we should flush the cache.
Link: https://lkml.kernel.org/r/5676f30436ab71d1a587ac73f835ed8bd2113ff5.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6a5ef9f0da6a..4cc849c3b54c 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -193,9 +193,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
a3589e1d5fe3 ("mm/migrate_device.c: add missing flush_cache_page()")
6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
6c54dc6c7437 ("mm/rmap: use page_move_anon_rmap() when reusing a mapped PageAnon() page exclusively")
28c5209dfd5f ("mm/rmap: pass rmap flags to hugepage_add_anon_rmap()")
f1e2db12e45b ("mm/rmap: remove do_page_add_anon_rmap()")
14f9135d5470 ("mm/rmap: convert RMAP flags to a proper distinct rmap_t type")
fb3d824d1a46 ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
b51ad4f8679e ("mm/memory: slightly simplify copy_present_pte()")
623a1ddfeb23 ("mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range()")
3bff7e3f1f16 ("mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()")
c145e0b47c77 ("mm: streamline COW logic in do_swap_page()")
84d60fdd3733 ("mm: slightly clarify KSM logic in do_swap_page()")
53a05ad9f21d ("mm: optimize do_wp_page() for exclusive pages in the swapcache")
9030fb0bb9d6 ("Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a3589e1d5fe39c3d9fdd291b111524b93d08bc32 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 2 Sep 2022 10:35:52 +1000
Subject: [PATCH] mm/migrate_device.c: add missing flush_cache_page()
Currently we only call flush_cache_page() for the anon_exclusive case;
however, in both cases we clear the pte, so we should flush the cache.
Link: https://lkml.kernel.org/r/5676f30436ab71d1a587ac73f835ed8bd2113ff5.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6a5ef9f0da6a..4cc849c3b54c 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -193,9 +193,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
70cbc3cc78a9 ("mm: gup: fix the fast GUP race against THP collapse")
b0496fe4effd ("mm/gup: Convert gup_pte_range() to use a folio")
1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
43ca106fa8ec ("mm: cma: support sysfs")
bbb269206f3c ("mm: vmstat: add cma statistics")
1fbaf8fc12a0 ("mm: add a io_mapping_map_user helper")
ac73e3dc8acd ("Merge branch 'akpm' (patches from Andrew)")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 70cbc3cc78a997d8247b50389d37c4e1736019da Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301(a)gmail.com>
Date: Wed, 7 Sep 2022 11:01:43 -0700
Subject: [PATCH] mm: gup: fix the fast GUP race against THP collapse
Since general RCU GUP fast was introduced in commit 2667f50e8b81 ("mm:
introduce a general RCU get_user_pages_fast()"), a TLB flush is no longer
sufficient to handle concurrent GUP-fast in all cases; it only handles
traditional IPI-based GUP-fast correctly. On architectures that send an
IPI broadcast on TLB flush, it works as expected. But on architectures
that do not use an IPI to broadcast the TLB flush, the following race is
possible:
CPU A                                          CPU B
THP collapse                                   fast GUP
                                               gup_pmd_range() <-- see valid pmd
                                               gup_pte_range() <-- work on pte
pmdp_collapse_flush() <-- clear pmd and flush
__collapse_huge_page_isolate()
    check page pinned <-- before GUP bump refcount
                                               pin the page
                                               check PTE <-- no change
__collapse_huge_page_copy()
    copy data to huge page
    ptep_clear()
install huge pmd for the huge page
                                               return the stale page
discard the stale page
The race can be fixed by checking whether the PMD has changed after
taking the page pin in fast GUP, just as is already done for the PTE. If
the PMD has changed, a THP collapse may be running in parallel, so GUP
should back off.
Also update the stale comment about serializing against fast GUP in
khugepaged.
Link: https://lkml.kernel.org/r/20220907180144.555485-1-shy828301@gmail.com
Fixes: 2667f50e8b81 ("mm: introduce a general RCU get_user_pages_fast()")
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Yang Shi <shy828301(a)gmail.com>
Reviewed-by: John Hubbard <jhubbard(a)nvidia.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.ibm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/gup.c b/mm/gup.c
index 5abdaf487460..00926abb4426 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2345,8 +2345,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2392,7 +2412,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2439,8 +2460,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2764,7 +2786,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 01f71786d530..70b7ac66411c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1083,10 +1083,12 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
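The decisive piece of the patch above is the pmd re-check after the page
has been pinned. A rough standalone model of the pin-then-revalidate
protocol fast GUP relies on (stub types and helpers; a sketch of the
idea, not the kernel implementation):

    #include <stdatomic.h>
    #include <stdbool.h>

    typedef _Atomic unsigned long entry_t;  /* stands in for pte_t/pmd_t */

    static bool try_pin_page(void) { return true; }  /* try_grab_folio() stand-in */
    static void unpin_page(void)   { }               /* gup_put_folio() stand-in */

    /* Pin first, then confirm that neither the pte nor the pmd changed
     * underneath us. A changed pmd signals a concurrent THP collapse
     * (pmdp_collapse_flush() on another CPU), so back off rather than
     * return a stale page.
     */
    static bool pin_one_page(entry_t *ptep, unsigned long pte_snap,
                             entry_t *pmdp, unsigned long pmd_snap)
    {
        if (!try_pin_page())
            return false;
        if (atomic_load(pmdp) != pmd_snap ||   /* new check added by the fix */
            atomic_load(ptep) != pte_snap) {   /* pre-existing pte check */
            unpin_page();
            return false;
        }
        return true;
    }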
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
70cbc3cc78a9 ("mm: gup: fix the fast GUP race against THP collapse")
b0496fe4effd ("mm/gup: Convert gup_pte_range() to use a folio")
1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
43ca106fa8ec ("mm: cma: support sysfs")
bbb269206f3c ("mm: vmstat: add cma statistics")
1fbaf8fc12a0 ("mm: add a io_mapping_map_user helper")
ac73e3dc8acd ("Merge branch 'akpm' (patches from Andrew)")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 70cbc3cc78a997d8247b50389d37c4e1736019da Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301(a)gmail.com>
Date: Wed, 7 Sep 2022 11:01:43 -0700
Subject: [PATCH] mm: gup: fix the fast GUP race against THP collapse
Since general RCU GUP fast was introduced in commit 2667f50e8b81 ("mm:
introduce a general RCU get_user_pages_fast()"), a TLB flush is no longer
sufficient to handle concurrent GUP-fast in all cases; it only handles
traditional IPI-based GUP-fast correctly. On architectures that send an
IPI broadcast on TLB flush, it works as expected. But on architectures
that do not use an IPI to broadcast the TLB flush, the following race is
possible:
CPU A                                          CPU B
THP collapse                                   fast GUP
                                               gup_pmd_range() <-- see valid pmd
                                               gup_pte_range() <-- work on pte
pmdp_collapse_flush() <-- clear pmd and flush
__collapse_huge_page_isolate()
    check page pinned <-- before GUP bump refcount
                                               pin the page
                                               check PTE <-- no change
__collapse_huge_page_copy()
    copy data to huge page
    ptep_clear()
install huge pmd for the huge page
                                               return the stale page
discard the stale page
The race can be fixed by checking whether the PMD has changed after
taking the page pin in fast GUP, just as is already done for the PTE. If
the PMD has changed, a THP collapse may be running in parallel, so GUP
should back off.
Also update the stale comment about serializing against fast GUP in
khugepaged.
Link: https://lkml.kernel.org/r/20220907180144.555485-1-shy828301@gmail.com
Fixes: 2667f50e8b81 ("mm: introduce a general RCU get_user_pages_fast()")
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Yang Shi <shy828301(a)gmail.com>
Reviewed-by: John Hubbard <jhubbard(a)nvidia.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.ibm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/gup.c b/mm/gup.c
index 5abdaf487460..00926abb4426 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2345,8 +2345,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2392,7 +2412,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2439,8 +2460,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2764,7 +2786,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 01f71786d530..70b7ac66411c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1083,10 +1083,12 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
70cbc3cc78a9 ("mm: gup: fix the fast GUP race against THP collapse")
b0496fe4effd ("mm/gup: Convert gup_pte_range() to use a folio")
1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
43ca106fa8ec ("mm: cma: support sysfs")
bbb269206f3c ("mm: vmstat: add cma statistics")
1fbaf8fc12a0 ("mm: add a io_mapping_map_user helper")
ac73e3dc8acd ("Merge branch 'akpm' (patches from Andrew)")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 70cbc3cc78a997d8247b50389d37c4e1736019da Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301(a)gmail.com>
Date: Wed, 7 Sep 2022 11:01:43 -0700
Subject: [PATCH] mm: gup: fix the fast GUP race against THP collapse
Since general RCU GUP fast was introduced in commit 2667f50e8b81 ("mm:
introduce a general RCU get_user_pages_fast()"), a TLB flush is no longer
sufficient to handle concurrent GUP-fast in all cases; it only handles
traditional IPI-based GUP-fast correctly. On architectures that send an
IPI broadcast on TLB flush, it works as expected. But on architectures
that do not use an IPI to broadcast the TLB flush, the following race is
possible:
CPU A                                          CPU B
THP collapse                                   fast GUP
                                               gup_pmd_range() <-- see valid pmd
                                               gup_pte_range() <-- work on pte
pmdp_collapse_flush() <-- clear pmd and flush
__collapse_huge_page_isolate()
    check page pinned <-- before GUP bump refcount
                                               pin the page
                                               check PTE <-- no change
__collapse_huge_page_copy()
    copy data to huge page
    ptep_clear()
install huge pmd for the huge page
                                               return the stale page
discard the stale page
The race can be fixed by checking whether the PMD has changed after
taking the page pin in fast GUP, just as is already done for the PTE. If
the PMD has changed, a THP collapse may be running in parallel, so GUP
should back off.
Also update the stale comment about serializing against fast GUP in
khugepaged.
Link: https://lkml.kernel.org/r/20220907180144.555485-1-shy828301@gmail.com
Fixes: 2667f50e8b81 ("mm: introduce a general RCU get_user_pages_fast()")
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Yang Shi <shy828301(a)gmail.com>
Reviewed-by: John Hubbard <jhubbard(a)nvidia.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.ibm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/gup.c b/mm/gup.c
index 5abdaf487460..00926abb4426 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2345,8 +2345,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2392,7 +2412,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2439,8 +2460,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2764,7 +2786,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 01f71786d530..70b7ac66411c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1083,10 +1083,12 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
70cbc3cc78a9 ("mm: gup: fix the fast GUP race against THP collapse")
b0496fe4effd ("mm/gup: Convert gup_pte_range() to use a folio")
1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
43ca106fa8ec ("mm: cma: support sysfs")
bbb269206f3c ("mm: vmstat: add cma statistics")
1fbaf8fc12a0 ("mm: add a io_mapping_map_user helper")
ac73e3dc8acd ("Merge branch 'akpm' (patches from Andrew)")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 70cbc3cc78a997d8247b50389d37c4e1736019da Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301(a)gmail.com>
Date: Wed, 7 Sep 2022 11:01:43 -0700
Subject: [PATCH] mm: gup: fix the fast GUP race against THP collapse
Since general RCU GUP fast was introduced in commit 2667f50e8b81 ("mm:
introduce a general RCU get_user_pages_fast()"), a TLB flush is no longer
sufficient to handle concurrent GUP-fast in all cases; it only handles
traditional IPI-based GUP-fast correctly. On architectures that send an
IPI broadcast on TLB flush, it works as expected. But on architectures
that do not use an IPI to broadcast the TLB flush, the following race is
possible:
CPU A                                          CPU B
THP collapse                                   fast GUP
                                               gup_pmd_range() <-- see valid pmd
                                               gup_pte_range() <-- work on pte
pmdp_collapse_flush() <-- clear pmd and flush
__collapse_huge_page_isolate()
    check page pinned <-- before GUP bump refcount
                                               pin the page
                                               check PTE <-- no change
__collapse_huge_page_copy()
    copy data to huge page
    ptep_clear()
install huge pmd for the huge page
                                               return the stale page
discard the stale page
The race can be fixed by checking whether the PMD has changed after
taking the page pin in fast GUP, just as is already done for the PTE. If
the PMD has changed, a THP collapse may be running in parallel, so GUP
should back off.
Also update the stale comment about serializing against fast GUP in
khugepaged.
Link: https://lkml.kernel.org/r/20220907180144.555485-1-shy828301@gmail.com
Fixes: 2667f50e8b81 ("mm: introduce a general RCU get_user_pages_fast()")
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Yang Shi <shy828301(a)gmail.com>
Reviewed-by: John Hubbard <jhubbard(a)nvidia.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.ibm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/gup.c b/mm/gup.c
index 5abdaf487460..00926abb4426 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2345,8 +2345,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2392,7 +2412,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2439,8 +2460,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2764,7 +2786,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 01f71786d530..70b7ac66411c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1083,10 +1083,12 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
70cbc3cc78a9 ("mm: gup: fix the fast GUP race against THP collapse")
b0496fe4effd ("mm/gup: Convert gup_pte_range() to use a folio")
1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
43ca106fa8ec ("mm: cma: support sysfs")
bbb269206f3c ("mm: vmstat: add cma statistics")
1fbaf8fc12a0 ("mm: add a io_mapping_map_user helper")
ac73e3dc8acd ("Merge branch 'akpm' (patches from Andrew)")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 70cbc3cc78a997d8247b50389d37c4e1736019da Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301(a)gmail.com>
Date: Wed, 7 Sep 2022 11:01:43 -0700
Subject: [PATCH] mm: gup: fix the fast GUP race against THP collapse
Since general RCU GUP fast was introduced in commit 2667f50e8b81 ("mm:
introduce a general RCU get_user_pages_fast()"), a TLB flush is no longer
sufficient to handle concurrent GUP-fast in all cases; it only handles
traditional IPI-based GUP-fast correctly. On architectures that send an
IPI broadcast on TLB flush, it works as expected. But on architectures
that do not use an IPI to broadcast the TLB flush, the following race is
possible:
CPU A                                          CPU B
THP collapse                                   fast GUP
                                               gup_pmd_range() <-- see valid pmd
                                               gup_pte_range() <-- work on pte
pmdp_collapse_flush() <-- clear pmd and flush
__collapse_huge_page_isolate()
    check page pinned <-- before GUP bump refcount
                                               pin the page
                                               check PTE <-- no change
__collapse_huge_page_copy()
    copy data to huge page
    ptep_clear()
install huge pmd for the huge page
                                               return the stale page
discard the stale page
The race can be fixed by checking whether the PMD has changed after
taking the page pin in fast GUP, just as is already done for the PTE. If
the PMD has changed, a THP collapse may be running in parallel, so GUP
should back off.
Also update the stale comment about serializing against fast GUP in
khugepaged.
Link: https://lkml.kernel.org/r/20220907180144.555485-1-shy828301@gmail.com
Fixes: 2667f50e8b81 ("mm: introduce a general RCU get_user_pages_fast()")
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Yang Shi <shy828301(a)gmail.com>
Reviewed-by: John Hubbard <jhubbard(a)nvidia.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.ibm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/gup.c b/mm/gup.c
index 5abdaf487460..00926abb4426 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2345,8 +2345,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2392,7 +2412,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2439,8 +2460,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2764,7 +2786,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 01f71786d530..70b7ac66411c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1083,10 +1083,12 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
70cbc3cc78a9 ("mm: gup: fix the fast GUP race against THP collapse")
b0496fe4effd ("mm/gup: Convert gup_pte_range() to use a folio")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 70cbc3cc78a997d8247b50389d37c4e1736019da Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301(a)gmail.com>
Date: Wed, 7 Sep 2022 11:01:43 -0700
Subject: [PATCH] mm: gup: fix the fast GUP race against THP collapse
Since general RCU GUP fast was introduced in commit 2667f50e8b81 ("mm:
introduce a general RCU get_user_pages_fast()"), a TLB flush is no longer
sufficient to handle concurrent GUP-fast in all cases; it only handles
traditional IPI-based GUP-fast correctly. On architectures that send an
IPI broadcast on TLB flush, it works as expected. But on architectures
that do not use an IPI to broadcast the TLB flush, the following race is
possible:
CPU A                                          CPU B
THP collapse                                   fast GUP
                                               gup_pmd_range() <-- see valid pmd
                                               gup_pte_range() <-- work on pte
pmdp_collapse_flush() <-- clear pmd and flush
__collapse_huge_page_isolate()
    check page pinned <-- before GUP bump refcount
                                               pin the page
                                               check PTE <-- no change
__collapse_huge_page_copy()
    copy data to huge page
    ptep_clear()
install huge pmd for the huge page
                                               return the stale page
discard the stale page
The race can be fixed by checking whether the PMD has changed after
taking the page pin in fast GUP, just as is already done for the PTE. If
the PMD has changed, a THP collapse may be running in parallel, so GUP
should back off.
Also update the stale comment about serializing against fast GUP in
khugepaged.
Link: https://lkml.kernel.org/r/20220907180144.555485-1-shy828301@gmail.com
Fixes: 2667f50e8b81 ("mm: introduce a general RCU get_user_pages_fast()")
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Yang Shi <shy828301(a)gmail.com>
Reviewed-by: John Hubbard <jhubbard(a)nvidia.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.ibm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/gup.c b/mm/gup.c
index 5abdaf487460..00926abb4426 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2345,8 +2345,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2392,7 +2412,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2439,8 +2460,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2764,7 +2786,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 01f71786d530..70b7ac66411c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1083,10 +1083,12 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
e9233917a7e5 ("mmc: core: Terminate infinite loop in SD-UHS voltage switch")
e42726646082 ("mmc: core: Replace with already defined values for readability")
09247e110b2e ("mmc: core: Allow UHS-I voltage switch for SDSC cards if supported")
2ed573b603f7 ("mmc: core: Clarify usage of mmc_set_signal_voltage()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e9233917a7e53980664efbc565888163c0a33c3f Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Tue, 13 Sep 2022 18:40:10 -0700
Subject: [PATCH] mmc: core: Terminate infinite loop in SD-UHS voltage switch
This loop intends to retry a max of 10 times, with some implicit
termination based on the SD_{R,}OCR_S18A bit. Unfortunately, the
termination condition depends on the value reported by the SD card
(*rocr), which may or may not correctly reflect what we asked it to do.
Needless to say, it's not wise to rely on the card doing what we expect;
we should at least terminate the loop regardless. So check both the
input and output values to ensure we terminate regardless of the SD
card's behavior.
Note that SDIO learned a similar retry loop in commit 0797e5f1453b
("mmc: core: Fixup signal voltage switch"), but that used the 'ocr'
result, and so the current pre-terminating condition looks like:
rocr & ocr & R4_18V_PRESENT
(i.e., it doesn't have the same bug.)
This addresses a number of crash reports seen on ChromeOS that look
like the following:
... // lots of repeated: ...
<4>[13142.846061] mmc1: Skipping voltage switch
<4>[13143.406087] mmc1: Skipping voltage switch
<4>[13143.964724] mmc1: Skipping voltage switch
<4>[13144.526089] mmc1: Skipping voltage switch
<4>[13145.086088] mmc1: Skipping voltage switch
<4>[13145.645941] mmc1: Skipping voltage switch
<3>[13146.153969] INFO: task halt:30352 blocked for more than 122 seconds.
...
Fixes: f2119df6b764 ("mmc: sd: add support for signal voltage switch procedure")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Link: https://lore.kernel.org/r/20220914014010.2076169-1-briannorris@chromium.org
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 06aa62ce0ed1..3662bf5320ce 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -870,7 +870,8 @@ int mmc_sd_get_cid(struct mmc_host *host, u32 ocr, u32 *cid, u32 *rocr)
* the CCS bit is set as well. We deliberately deviate from the spec in
* regards to this, which allows UHS-I to be supported for SDSC cards.
*/
- if (!mmc_host_is_spi(host) && rocr && (*rocr & SD_ROCR_S18A)) {
+ if (!mmc_host_is_spi(host) && (ocr & SD_OCR_S18R) &&
+ rocr && (*rocr & SD_ROCR_S18A)) {
err = mmc_set_uhs_voltage(host, pocr);
if (err == -EAGAIN) {
retries--;
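The failure mode here is generic: a retry loop whose only exit condition
is device-reported state never terminates when the device misreports.
The fix gates the switch on the host-requested bit as well; since the
driver clears SD_OCR_S18R from 'ocr' once the retries are used up, the
loop is then guaranteed to exit no matter what the card keeps claiming.
A simplified sketch of the guarded condition (bit values as in
include/linux/mmc/sd.h; a model for illustration, not the driver code):

    #include <stdbool.h>
    #include <stdint.h>

    #define SD_OCR_S18R   (1u << 24)   /* host requested 1.8V signaling */
    #define SD_ROCR_S18A  (1u << 24)   /* card reports 1.8V accepted */

    /* Attempt the voltage switch only if the host asked for it (ocr,
     * under host control) and the card acknowledged it (*rocr, under
     * card control). The host-side check is what bounds the loop.
     */
    static bool should_try_uhs_switch(uint32_t ocr, const uint32_t *rocr)
    {
        return (ocr & SD_OCR_S18R) && rocr && (*rocr & SD_ROCR_S18A);
    }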
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
e9233917a7e5 ("mmc: core: Terminate infinite loop in SD-UHS voltage switch")
e42726646082 ("mmc: core: Replace with already defined values for readability")
09247e110b2e ("mmc: core: Allow UHS-I voltage switch for SDSC cards if supported")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e9233917a7e53980664efbc565888163c0a33c3f Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Tue, 13 Sep 2022 18:40:10 -0700
Subject: [PATCH] mmc: core: Terminate infinite loop in SD-UHS voltage switch
This loop intends to retry a max of 10 times, with some implicit
termination based on the SD_{R,}OCR_S18A bit. Unfortunately, the
termination condition depends on the value reported by the SD card
(*rocr), which may or may not correctly reflect what we asked it to do.
Needless to say, it's not wise to rely on the card doing what we expect;
we should at least terminate the loop regardless. So check both the
input and output values to ensure we terminate regardless of the SD
card's behavior.
Note that SDIO learned a similar retry loop in commit 0797e5f1453b
("mmc: core: Fixup signal voltage switch"), but that used the 'ocr'
result, and so the current pre-terminating condition looks like:
rocr & ocr & R4_18V_PRESENT
(i.e., it doesn't have the same bug.)
This addresses a number of crash reports seen on ChromeOS that look
like the following:
... // lots of repeated: ...
<4>[13142.846061] mmc1: Skipping voltage switch
<4>[13143.406087] mmc1: Skipping voltage switch
<4>[13143.964724] mmc1: Skipping voltage switch
<4>[13144.526089] mmc1: Skipping voltage switch
<4>[13145.086088] mmc1: Skipping voltage switch
<4>[13145.645941] mmc1: Skipping voltage switch
<3>[13146.153969] INFO: task halt:30352 blocked for more than 122 seconds.
...
Fixes: f2119df6b764 ("mmc: sd: add support for signal voltage switch procedure")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Link: https://lore.kernel.org/r/20220914014010.2076169-1-briannorris@chromium.org
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 06aa62ce0ed1..3662bf5320ce 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -870,7 +870,8 @@ int mmc_sd_get_cid(struct mmc_host *host, u32 ocr, u32 *cid, u32 *rocr)
* the CCS bit is set as well. We deliberately deviate from the spec in
* regards to this, which allows UHS-I to be supported for SDSC cards.
*/
- if (!mmc_host_is_spi(host) && rocr && (*rocr & SD_ROCR_S18A)) {
+ if (!mmc_host_is_spi(host) && (ocr & SD_OCR_S18R) &&
+ rocr && (*rocr & SD_ROCR_S18A)) {
err = mmc_set_uhs_voltage(host, pocr);
if (err == -EAGAIN) {
retries--;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
e9233917a7e5 ("mmc: core: Terminate infinite loop in SD-UHS voltage switch")
e42726646082 ("mmc: core: Replace with already defined values for readability")
09247e110b2e ("mmc: core: Allow UHS-I voltage switch for SDSC cards if supported")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e9233917a7e53980664efbc565888163c0a33c3f Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Tue, 13 Sep 2022 18:40:10 -0700
Subject: [PATCH] mmc: core: Terminate infinite loop in SD-UHS voltage switch
This loop intends to retry a max of 10 times, with some implicit
termination based on the SD_{R,}OCR_S18A bit. Unfortunately, the
termination condition depends on the value reported by the SD card
(*rocr), which may or may not correctly reflect what we asked it to do.
Needless to say, it's not wise to rely on the card doing what we expect;
we should at least terminate the loop regardless. So check both the
input and output values to ensure we terminate regardless of the SD
card's behavior.
Note that SDIO learned a similar retry loop in commit 0797e5f1453b
("mmc: core: Fixup signal voltage switch"), but that used the 'ocr'
result, and so the current pre-terminating condition looks like:
rocr & ocr & R4_18V_PRESENT
(i.e., it doesn't have the same bug.)
This addresses a number of crash reports seen on ChromeOS that look
like the following:
... // lots of repeated: ...
<4>[13142.846061] mmc1: Skipping voltage switch
<4>[13143.406087] mmc1: Skipping voltage switch
<4>[13143.964724] mmc1: Skipping voltage switch
<4>[13144.526089] mmc1: Skipping voltage switch
<4>[13145.086088] mmc1: Skipping voltage switch
<4>[13145.645941] mmc1: Skipping voltage switch
<3>[13146.153969] INFO: task halt:30352 blocked for more than 122 seconds.
...
Fixes: f2119df6b764 ("mmc: sd: add support for signal voltage switch procedure")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Link: https://lore.kernel.org/r/20220914014010.2076169-1-briannorris@chromium.org
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 06aa62ce0ed1..3662bf5320ce 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -870,7 +870,8 @@ int mmc_sd_get_cid(struct mmc_host *host, u32 ocr, u32 *cid, u32 *rocr)
* the CCS bit is set as well. We deliberately deviate from the spec in
* regards to this, which allows UHS-I to be supported for SDSC cards.
*/
- if (!mmc_host_is_spi(host) && rocr && (*rocr & SD_ROCR_S18A)) {
+ if (!mmc_host_is_spi(host) && (ocr & SD_OCR_S18R) &&
+ rocr && (*rocr & SD_ROCR_S18A)) {
err = mmc_set_uhs_voltage(host, pocr);
if (err == -EAGAIN) {
retries--;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
e9233917a7e5 ("mmc: core: Terminate infinite loop in SD-UHS voltage switch")
e42726646082 ("mmc: core: Replace with already defined values for readability")
09247e110b2e ("mmc: core: Allow UHS-I voltage switch for SDSC cards if supported")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e9233917a7e53980664efbc565888163c0a33c3f Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Tue, 13 Sep 2022 18:40:10 -0700
Subject: [PATCH] mmc: core: Terminate infinite loop in SD-UHS voltage switch
This loop intends to retry a max of 10 times, with some implicit
termination based on the SD_{R,}OCR_S18A bit. Unfortunately, the
termination condition depends on the value reported by the SD card
(*rocr), which may or may not correctly reflect what we asked it to do.
Needless to say, it's not wise to rely on the card doing what we expect;
we should at least terminate the loop regardless. So check both the
input and output values to ensure we terminate regardless of the SD
card's behavior.
Note that SDIO learned a similar retry loop in commit 0797e5f1453b
("mmc: core: Fixup signal voltage switch"), but that used the 'ocr'
result, and so the current pre-terminating condition looks like:
rocr & ocr & R4_18V_PRESENT
(i.e., it doesn't have the same bug.)
This addresses a number of crash reports seen on ChromeOS that look
like the following:
... // lots of repeated: ...
<4>[13142.846061] mmc1: Skipping voltage switch
<4>[13143.406087] mmc1: Skipping voltage switch
<4>[13143.964724] mmc1: Skipping voltage switch
<4>[13144.526089] mmc1: Skipping voltage switch
<4>[13145.086088] mmc1: Skipping voltage switch
<4>[13145.645941] mmc1: Skipping voltage switch
<3>[13146.153969] INFO: task halt:30352 blocked for more than 122 seconds.
...
Fixes: f2119df6b764 ("mmc: sd: add support for signal voltage switch procedure")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Link: https://lore.kernel.org/r/20220914014010.2076169-1-briannorris@chromium.org
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 06aa62ce0ed1..3662bf5320ce 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -870,7 +870,8 @@ int mmc_sd_get_cid(struct mmc_host *host, u32 ocr, u32 *cid, u32 *rocr)
* the CCS bit is set as well. We deliberately deviate from the spec in
* regards to this, which allows UHS-I to be supported for SDSC cards.
*/
- if (!mmc_host_is_spi(host) && rocr && (*rocr & SD_ROCR_S18A)) {
+ if (!mmc_host_is_spi(host) && (ocr & SD_OCR_S18R) &&
+ rocr && (*rocr & SD_ROCR_S18A)) {
err = mmc_set_uhs_voltage(host, pocr);
if (err == -EAGAIN) {
retries--;
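The heart of the fix is visible even out of context: the retry loop must be bounded by what the host requested (ocr), not only by what the card reports back (*rocr). A self-contained sketch of that check, with illustrative bit values standing in for the real mmc core definitions:

#include <stdbool.h>
#include <stdio.h>

#define SD_OCR_S18R	(1u << 24)	/* host asked for 1.8V signalling */
#define SD_ROCR_S18A	(1u << 24)	/* card claims it accepted it */

static bool should_try_voltage_switch(bool is_spi, unsigned int ocr,
				      const unsigned int *rocr)
{
	/*
	 * The old check trusted only the card's reply (*rocr): a card
	 * that keeps reporting S18A even though the switch never takes
	 * effect makes the retry loop spin forever.  Gating on the
	 * host-side request bit bounds the loop regardless of card
	 * behaviour.
	 */
	return !is_spi && (ocr & SD_OCR_S18R) && rocr && (*rocr & SD_ROCR_S18A);
}

int main(void)
{
	unsigned int rocr = SD_ROCR_S18A;

	printf("%d\n", should_try_voltage_switch(false, 0, &rocr));		/* 0 */
	printf("%d\n", should_try_voltage_switch(false, SD_OCR_S18R, &rocr));	/* 1 */
	return 0;
}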
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
e9233917a7e5 ("mmc: core: Terminate infinite loop in SD-UHS voltage switch")
e42726646082 ("mmc: core: Replace with already defined values for readability")
09247e110b2e ("mmc: core: Allow UHS-I voltage switch for SDSC cards if supported")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e9233917a7e53980664efbc565888163c0a33c3f Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Tue, 13 Sep 2022 18:40:10 -0700
Subject: [PATCH] mmc: core: Terminate infinite loop in SD-UHS voltage switch
This loop intends to retry a max of 10 times, with some implicit
termination based on the SD_{R,}OCR_S18A bit. Unfortunately, the
termination condition depends on the value reported by the SD card
(*rocr), which may or may not correctly reflect what we asked it to do.
Needless to say, it's not wise to rely on the card doing what we expect;
we should at least terminate the loop regardless. So, check both the
input and output values, so we ensure we will terminate regardless of
the SD card behavior.
Note that SDIO learned a similar retry loop in commit 0797e5f1453b
("mmc: core: Fixup signal voltage switch"), but that used the 'ocr'
result, and so the current pre-terminating condition looks like:
rocr & ocr & R4_18V_PRESENT
(i.e., it doesn't have the same bug.)
This addresses a number of crash reports seen on ChromeOS that look
like the following:
... // lots of repeated: ...
<4>[13142.846061] mmc1: Skipping voltage switch
<4>[13143.406087] mmc1: Skipping voltage switch
<4>[13143.964724] mmc1: Skipping voltage switch
<4>[13144.526089] mmc1: Skipping voltage switch
<4>[13145.086088] mmc1: Skipping voltage switch
<4>[13145.645941] mmc1: Skipping voltage switch
<3>[13146.153969] INFO: task halt:30352 blocked for more than 122 seconds.
...
Fixes: f2119df6b764 ("mmc: sd: add support for signal voltage switch procedure")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Link: https://lore.kernel.org/r/20220914014010.2076169-1-briannorris@chromium.org
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 06aa62ce0ed1..3662bf5320ce 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -870,7 +870,8 @@ int mmc_sd_get_cid(struct mmc_host *host, u32 ocr, u32 *cid, u32 *rocr)
* the CCS bit is set as well. We deliberately deviate from the spec in
* regards to this, which allows UHS-I to be supported for SDSC cards.
*/
- if (!mmc_host_is_spi(host) && rocr && (*rocr & SD_ROCR_S18A)) {
+ if (!mmc_host_is_spi(host) && (ocr & SD_OCR_S18R) &&
+ rocr && (*rocr & SD_ROCR_S18A)) {
err = mmc_set_uhs_voltage(host, pocr);
if (err == -EAGAIN) {
retries--;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
e9233917a7e5 ("mmc: core: Terminate infinite loop in SD-UHS voltage switch")
e42726646082 ("mmc: core: Replace with already defined values for readability")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e9233917a7e53980664efbc565888163c0a33c3f Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Tue, 13 Sep 2022 18:40:10 -0700
Subject: [PATCH] mmc: core: Terminate infinite loop in SD-UHS voltage switch
This loop intends to retry a max of 10 times, with some implicit
termination based on the SD_{R,}OCR_S18A bit. Unfortunately, the
termination condition depends on the value reported by the SD card
(*rocr), which may or may not correctly reflect what we asked it to do.
Needless to say, it's not wise to rely on the card doing what we expect;
we should at least terminate the loop regardless. So, check both the
input and output values, so we ensure we will terminate regardless of
the SD card behavior.
Note that SDIO learned a similar retry loop in commit 0797e5f1453b
("mmc: core: Fixup signal voltage switch"), but that used the 'ocr'
result, and so the current pre-terminating condition looks like:
rocr & ocr & R4_18V_PRESENT
(i.e., it doesn't have the same bug.)
This addresses a number of crash reports seen on ChromeOS that look
like the following:
... // lots of repeated: ...
<4>[13142.846061] mmc1: Skipping voltage switch
<4>[13143.406087] mmc1: Skipping voltage switch
<4>[13143.964724] mmc1: Skipping voltage switch
<4>[13144.526089] mmc1: Skipping voltage switch
<4>[13145.086088] mmc1: Skipping voltage switch
<4>[13145.645941] mmc1: Skipping voltage switch
<3>[13146.153969] INFO: task halt:30352 blocked for more than 122 seconds.
...
Fixes: f2119df6b764 ("mmc: sd: add support for signal voltage switch procedure")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Link: https://lore.kernel.org/r/20220914014010.2076169-1-briannorris@chromium.org
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 06aa62ce0ed1..3662bf5320ce 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -870,7 +870,8 @@ int mmc_sd_get_cid(struct mmc_host *host, u32 ocr, u32 *cid, u32 *rocr)
* the CCS bit is set as well. We deliberately deviate from the spec in
* regards to this, which allows UHS-I to be supported for SDSC cards.
*/
- if (!mmc_host_is_spi(host) && rocr && (*rocr & SD_ROCR_S18A)) {
+ if (!mmc_host_is_spi(host) && (ocr & SD_OCR_S18R) &&
+ rocr && (*rocr & SD_ROCR_S18A)) {
err = mmc_set_uhs_voltage(host, pocr);
if (err == -EAGAIN) {
retries--;
The patch below does not apply to the 5.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
1c8e2349f2d0 ("damon/sysfs: fix possible memleak on damon_sysfs_add_target")
c9e124e0382d ("mm/damon/{dbgfs,sysfs}: move target_has_pid() from dbgfs to damon.h")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1c8e2349f2d033f634d046063b704b2ca6c46972 Mon Sep 17 00:00:00 2001
From: Levi Yun <ppbuk5246(a)gmail.com>
Date: Mon, 26 Sep 2022 16:06:11 +0000
Subject: [PATCH] damon/sysfs: fix possible memleak on damon_sysfs_add_target
When damon_sysfs_add_target couldn't find a proper task, the newly
allocated damon_target structure isn't registered yet, so it's impossible
for damon_sysfs_destroy_targets to free it.
Fix this possible memory leak by calling damon_add_target as soon as the
new target is allocated.
Link: https://lkml.kernel.org/r/20220926160611.48536-1-sj@kernel.org
Fixes: a61ea561c871 ("mm/damon/sysfs: link DAMON for virtual address spaces monitoring")
Signed-off-by: Levi Yun <ppbuk5246(a)gmail.com>
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Reviewed-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [5.17.x]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 7488e27c87c3..bdef9682d0a0 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -2182,12 +2182,12 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
if (!t)
return -ENOMEM;
+ damon_add_target(ctx, t);
if (damon_target_has_pid(ctx)) {
t->pid = find_get_pid(sys_target->pid);
if (!t->pid)
goto destroy_targets_out;
}
- damon_add_target(ctx, t);
err = damon_sysfs_set_regions(t, sys_target->regions);
if (err)
goto destroy_targets_out;
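The ordering is the whole fix: once the target is on the context's list, the common destroy path can reach and free it after any later failure, whereas an unregistered allocation leaks. A stand-alone sketch of the same pattern, with invented names rather than the DAMON code:

#include <stdlib.h>

struct target { struct target *next; int pid; };
struct ctx { struct target *targets; };

static void add_target(struct ctx *c, struct target *t)
{
	t->next = c->targets;
	c->targets = t;
}

static void destroy_targets(struct ctx *c)
{
	struct target *t = c->targets;

	while (t) {
		struct target *next = t->next;

		free(t);
		t = next;
	}
	c->targets = NULL;
}

static int add_target_checked(struct ctx *c, int pid)
{
	struct target *t = malloc(sizeof(*t));

	if (!t)
		return -1;
	add_target(c, t);		/* register first ... */
	t->pid = pid;
	if (pid < 0) {			/* ... so a later failure ... */
		destroy_targets(c);	/* ... can still free it */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct ctx c = { .targets = NULL };

	add_target_checked(&c, -1);	/* fails, but nothing leaks */
	add_target_checked(&c, 42);
	destroy_targets(&c);
	return 0;
}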
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
6ef7d362123e ("drm/i915/gt: Restrict forced preemption to the active context")
3e28d37146db ("drm/i915: Move priolist to new i915_sched_engine object")
2913fa4d7d42 ("drm/i915/gt: use new tasklet API for execution list")
eb5c10cbbc2f ("drm/i915: Remove I915_USER_PRIORITY_SHIFT")
2867ff6ceb25 ("drm/i915: Strip out internal priorities")
a829f033e966 ("drm/i915: Wedge the GPU if command parser setup fails")
f99e67f1b929 ("drm/i915/display: Apply interactive priority to explicit flip fences")
007c45787650 ("drm/i915/guc: stop calling execlists_set_default_submission")
43aaadc67e6f ("drm/i915/guc: init engine directly in GuC submission mode")
7e5299cebe91 ("drm/i915/guc: Delete GuC code unused in future patches")
baa7c2cd99c6 ("drm/i915: Refactor marking a request as EIO")
751f82b353a6 ("drm/i915/gt: Only disable preemption on gen8 render engines")
4386b8e5ad71 ("drm/i915/gt: Remove timeslice suppression")
fe7bcfaeb2b7 ("drm/i915/gt: Refactor heartbeat request construction and submission")
f81475bb5bb4 ("drm/i915/gt: Resubmit the virtual engine on schedule-out")
bab0557c8dca ("drm/i915/gt: Remove virtual breadcrumb before transfer")
6f0726b4807c ("drm/i915/gt: Defer schedule_out until after the next dequeue")
2efa2c522ab0 ("drm/i915/gt: Decouple inflight virtual engines")
64b7a3fa7e3e ("drm/i915/gt: Use virtual_engine during execlists_dequeue")
16f2941ad307 ("drm/i915/gt: Replace direct submit with direct call to tasklet")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6ef7d362123ecb5bf6d163bb9c7fd6ba2d8c968c Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Wed, 21 Sep 2022 15:52:58 +0200
Subject: [PATCH] drm/i915/gt: Restrict forced preemption to the active context
When we submit a new pair of contexts to ELSP for execution, we start a
timer by which point we expect the HW to have switched execution to the
pending contexts. If the promotion to the new pair of contexts has not
occurred, we declare the executing context to have hung and force the
preemption to take place by resetting the engine and resubmitting the
new contexts.
This can lead to an unfair situation where almost all of the preemption
timeout is consumed by the first context which just switches into the
second context immediately prior to the timer firing and triggering the
preemption reset (assuming that the timer interrupts before we process
the CS events for the context switch). The second context hasn't yet had
a chance to yield to the incoming ELSP (and send the ACK for the
promotion) and so ends up being blamed for the reset.
If we see that a context switch has occurred since setting the
preemption timeout, but have not yet received the ACK for the ELSP
promotion, rearm the preemption timer and check again. This is
especially significant if the first context was not schedulable and so
we used the shortest timer possible, greatly increasing the chance of
accidentally blaming the second innocent context.
Fixes: 3a7a92aba8fb ("drm/i915/execlists: Force preemption")
Fixes: d12acee84ffb ("drm/i915/execlists: Cancel banned contexts on schedule-out")
Reported-by: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
Cc: Andi Shyti <andi.shyti(a)linux.intel.com>
Reviewed-by: Andrzej Hajda <andrzej.hajda(a)intel.com>
Tested-by: Andrzej Hajda <andrzej.hajda(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.5+
Signed-off-by: Andi Shyti <andi.shyti(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220921135258.1714873-1-andr…
(cherry picked from commit 107ba1a2c705f4358f2602ec2f2fd821bb651f42)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi(a)intel.com>
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 633a7e5dba3b..6b5d4ea22b67 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -165,6 +165,21 @@ struct intel_engine_execlists {
*/
struct timer_list preempt;
+ /**
+ * @preempt_target: active request at the time of the preemption request
+ *
+ * We force a preemption to occur if the pending contexts have not
+ * been promoted to active upon receipt of the CS ack event within
+ * the timeout. This timeout maybe chosen based on the target,
+ * using a very short timeout if the context is no longer schedulable.
+ * That short timeout may not be applicable to other contexts, so
+ * if a context switch should happen within before the preemption
+ * timeout, we may shoot early at an innocent context. To prevent this,
+ * we record which context was active at the time of the preemption
+ * request and only reset that context upon the timeout.
+ */
+ const struct i915_request *preempt_target;
+
/**
* @ccid: identifier for contexts submitted to this engine
*/
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 4b909cb88cdf..c718e6dc40b5 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -1241,6 +1241,9 @@ static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
if (!rq)
return 0;
+ /* Only allow ourselves to force reset the currently active context */
+ engine->execlists.preempt_target = rq;
+
/* Force a fast reset for terminated contexts (ignoring sysfs!) */
if (unlikely(intel_context_is_banned(rq->context) || bad_request(rq)))
return INTEL_CONTEXT_BANNED_PREEMPT_TIMEOUT_MS;
@@ -2427,8 +2430,24 @@ static void execlists_submission_tasklet(struct tasklet_struct *t)
GEM_BUG_ON(inactive - post > ARRAY_SIZE(post));
if (unlikely(preempt_timeout(engine))) {
+ const struct i915_request *rq = *engine->execlists.active;
+
+ /*
+ * If after the preempt-timeout expired, we are still on the
+ * same active request/context as before we initiated the
+ * preemption, reset the engine.
+ *
+ * However, if we have processed a CS event to switch contexts,
+ * but not yet processed the CS event for the pending
+ * preemption, reset the timer allowing the new context to
+ * gracefully exit.
+ */
cancel_timer(&engine->execlists.preempt);
- engine->execlists.error_interrupt |= ERROR_PREEMPT;
+ if (rq == engine->execlists.preempt_target)
+ engine->execlists.error_interrupt |= ERROR_PREEMPT;
+ else
+ set_timer_ms(&engine->execlists.preempt,
+ active_preempt_timeout(engine, rq));
}
if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
The quilt patch titled
Subject: damon/sysfs: fix possible memleak on damon_sysfs_add_target
has been removed from the -mm tree. Its filename was
damon-sysfs-fix-possible-memleak-on-damon_sysfs_add_target.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Levi Yun <ppbuk5246(a)gmail.com>
Subject: damon/sysfs: fix possible memleak on damon_sysfs_add_target
Date: Mon, 26 Sep 2022 16:06:11 +0000
When damon_sysfs_add_target couldn't find a proper task, the newly
allocated damon_target structure isn't registered yet, so it's impossible
for damon_sysfs_destroy_targets to free it.
Fix this possible memory leak by calling damon_add_target as soon as the
new target is allocated.
Link: https://lkml.kernel.org/r/20220926160611.48536-1-sj@kernel.org
Fixes: a61ea561c871 ("mm/damon/sysfs: link DAMON for virtual address spaces monitoring")
Signed-off-by: Levi Yun <ppbuk5246(a)gmail.com>
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Reviewed-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [5.17.x]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/sysfs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/damon/sysfs.c~damon-sysfs-fix-possible-memleak-on-damon_sysfs_add_target
+++ a/mm/damon/sysfs.c
@@ -2182,12 +2182,12 @@ static int damon_sysfs_add_target(struct
if (!t)
return -ENOMEM;
+ damon_add_target(ctx, t);
if (damon_target_has_pid(ctx)) {
t->pid = find_get_pid(sys_target->pid);
if (!t->pid)
goto destroy_targets_out;
}
- damon_add_target(ctx, t);
err = damon_sysfs_set_regions(t, sys_target->regions);
if (err)
goto destroy_targets_out;
_
Patches currently in -mm which might be from ppbuk5246(a)gmail.com are
The quilt patch titled
Subject: mm: fix BUG splat with kvmalloc + GFP_ATOMIC
has been removed from the -mm tree. Its filename was
mm-fix-bug-splat-with-kvmalloc-gfp_atomic.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Florian Westphal <fw(a)strlen.de>
Subject: mm: fix BUG splat with kvmalloc + GFP_ATOMIC
Date: Mon, 26 Sep 2022 17:16:50 +0200
Martin Zaharinov reports BUG with 5.19.10 kernel:
kernel BUG at mm/vmalloc.c:2437!
invalid opcode: 0000 [#1] SMP
CPU: 28 PID: 0 Comm: swapper/28 Tainted: G W O 5.19.9 #1
[..]
RIP: 0010:__get_vm_area_node+0x120/0x130
__vmalloc_node_range+0x96/0x1e0
kvmalloc_node+0x92/0xb0
bucket_table_alloc.isra.0+0x47/0x140
rhashtable_try_insert+0x3a4/0x440
rhashtable_insert_slow+0x1b/0x30
[..]
bucket_table_alloc uses kvzalloc(GFP_ATOMIC). If kmalloc fails, this now
falls through to vmalloc and hits code paths that assume GFP_KERNEL.
Link: https://lkml.kernel.org/r/20220926151650.15293-1-fw@strlen.de
Fixes: a421ef303008 ("mm: allow !GFP_KERNEL allocations for kvmalloc")
Signed-off-by: Florian Westphal <fw(a)strlen.de>
Suggested-by: Michal Hocko <mhocko(a)suse.com>
Link: https://lore.kernel.org/linux-mm/Yy3MS2uhSgjF47dy@pc636/T/#t
Acked-by: Michal Hocko <mhocko(a)suse.com>
Reported-by: Martin Zaharinov <micron10(a)gmail.com>
Cc: Uladzislau Rezki (Sony) <urezki(a)gmail.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/util.c | 4 ++++
1 file changed, 4 insertions(+)
--- a/mm/util.c~mm-fix-bug-splat-with-kvmalloc-gfp_atomic
+++ a/mm/util.c
@@ -619,6 +619,10 @@ void *kvmalloc_node(size_t size, gfp_t f
if (ret || size <= PAGE_SIZE)
return ret;
+ /* non-sleeping allocations are not supported by vmalloc */
+ if (!gfpflags_allow_blocking(flags))
+ return NULL;
+
/* Don't even allow crazy sizes */
if (unlikely(size > INT_MAX)) {
WARN_ON_ONCE(!(flags & __GFP_NOWARN));
_
Patches currently in -mm which might be from fw(a)strlen.de are
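The behavioural change is easy to model in isolation: the small allocation is tried first, and the large fallback is simply refused when the flags forbid sleeping. A userspace sketch of the fixed policy, with illustrative flag values standing in for the real GFP constants:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define GFP_KERNEL	0x1u	/* illustrative values, not the kernel's */
#define GFP_ATOMIC	0x2u

static bool gfpflags_allow_blocking(unsigned int flags)
{
	return flags & GFP_KERNEL;
}

/* Mirrors the fixed kvmalloc_node() policy: try the slab-style
 * allocator first, then refuse the "vmalloc" fallback when the caller
 * cannot sleep. */
static void *kvmalloc_sketch(size_t size, unsigned int flags)
{
	void *p = size <= 4096 ? malloc(size) : NULL;	/* "kmalloc" */

	if (p || size <= 4096)
		return p;
	if (!gfpflags_allow_blocking(flags))
		return NULL;		/* vmalloc path can sleep: bail out */
	return malloc(size);		/* "vmalloc" fallback */
}

int main(void)
{
	void *p = kvmalloc_sketch(1 << 20, GFP_ATOMIC);

	printf("%s\n", p ? "allocated" : "refused (atomic)");	/* refused */
	free(kvmalloc_sketch(1 << 20, GFP_KERNEL));
	return 0;
}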
We enable -Wcast-function-type globally in the kernel to warn about
mismatching types in function pointer casts. Compilers currently
warn only about ABI incompatibility with this flag, but Clang 16 will
enable a stricter version of the check by default that checks for an
exact type match. This will be very noisy in the kernel, so disable
-Wcast-function-type-strict without W=1 until the new warnings have
been addressed.
Cc: stable(a)vger.kernel.org
Link: https://reviews.llvm.org/D134831
Link: https://github.com/ClangBuiltLinux/linux/issues/1724
Suggested-by: Nathan Chancellor <nathan(a)kernel.org>
Signed-off-by: Sami Tolvanen <samitolvanen(a)google.com>
---
scripts/Makefile.extrawarn | 1 +
1 file changed, 1 insertion(+)
diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn
index 6ae482158bc4..52bd7df84fd6 100644
--- a/scripts/Makefile.extrawarn
+++ b/scripts/Makefile.extrawarn
@@ -64,6 +64,7 @@ KBUILD_CFLAGS += -Wno-sign-compare
KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast)
KBUILD_CFLAGS += -Wno-tautological-constant-out-of-range-compare
KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access)
+KBUILD_CFLAGS += $(call cc-disable-warning, cast-function-type-strict)
endif
endif
base-commit: 7bc6e90d7aa4170039abe80b9f4e8c8e4eb35091
--
2.38.0.rc1.362.ged0d419d3c-goog
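What the stricter check catches can be shown in a few lines: the cast below is ABI-compatible on common targets, so plain -Wcast-function-type stays quiet, but the types do not match exactly, so -Wcast-function-type-strict warns (a minimal illustration, not kernel code):

struct foo { int x; };

static int handler(struct foo *f) { return f->x; }

typedef int (*generic_fn)(void *);

/* ABI-compatible, but not an exact type match: the strict check warns. */
static generic_fn fn = (generic_fn)handler;

int main(void)
{
	return fn ? 0 : 1;
}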
From: Quentin Schulz <quentin.schulz(a)theobroma-systems.com>
Before the split of the gpio and pinctrl sections into their own drivers,
rockchip_set_mux was called in pinmux_ops.gpio_set_direction for
configuring a pin in its GPIO function.
This is essential for cases where pinctrl is "bypassed" by gpio
consumers; otherwise the GPIO function is not configured for the pin and
it does not work. Such was the case for the sysfs/libgpiod userspace
GPIO handling.
Let's call pinctrl_gpio_direction_input/output when setting the
direction of a GPIO so that the pinctrl core requests from the rockchip
pinctrl driver to put the pin in its GPIO function.
Fixes: 9ce9a02039de ("pinctrl/rockchip: drop the gpio related codes")
Fixes: 936ee2675eee ("gpio/rockchip: add driver for rockchip gpio")
Cc: stable(a)vger.kernel.org
Reviewed-by: Heiko Stuebner <heiko(a)sntech.de>
Signed-off-by: Quentin Schulz <quentin.schulz(a)theobroma-systems.com>
---
v2:
- added Reviewed-by,
- added missing linux/pinctrl/consumer.h header,
drivers/gpio/gpio-rockchip.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c
index bb50335239ac8..9c976ad7208ef 100644
--- a/drivers/gpio/gpio-rockchip.c
+++ b/drivers/gpio/gpio-rockchip.c
@@ -19,6 +19,7 @@
#include <linux/of_address.h>
#include <linux/of_device.h>
#include <linux/of_irq.h>
+#include <linux/pinctrl/consumer.h>
#include <linux/pinctrl/pinconf-generic.h>
#include <linux/regmap.h>
@@ -156,6 +157,12 @@ static int rockchip_gpio_set_direction(struct gpio_chip *chip,
unsigned long flags;
u32 data = input ? 0 : 1;
+
+ if (input)
+ pinctrl_gpio_direction_input(bank->pin_base + offset);
+ else
+ pinctrl_gpio_direction_output(bank->pin_base + offset);
+
raw_spin_lock_irqsave(&bank->slock, flags);
rockchip_gpio_writel_bit(bank, offset, data, bank->gpio_regs->port_ddr);
raw_spin_unlock_irqrestore(&bank->slock, flags);
--
2.37.3
From: Quentin Schulz <quentin.schulz(a)theobroma-systems.com>
Before the split of the gpio and pinctrl sections into their own drivers,
rockchip_set_mux was called in pinmux_ops.gpio_set_direction for
configuring a pin in its GPIO function.
This is essential for cases where pinctrl is "bypassed" by gpio
consumers; otherwise the GPIO function is not configured for the pin and
it does not work. Such was the case for the sysfs/libgpiod userspace
GPIO handling.
Let's re-implement the pinmux_ops.gpio_set_direction callback so that
the gpio subsystem can request from the pinctrl driver to put the pin in
its GPIO function.
Fixes: 9ce9a02039de ("pinctrl/rockchip: drop the gpio related codes")
Cc: stable(a)vger.kernel.org
Reviewed-by: Heiko Stuebner <heiko(a)sntech.de>
Signed-off-by: Quentin Schulz <quentin.schulz(a)theobroma-systems.com>
---
v2:
- added Reviewed-by,
drivers/pinctrl/pinctrl-rockchip.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c
index 32e41395fc768..c84bd0e1ce5a6 100644
--- a/drivers/pinctrl/pinctrl-rockchip.c
+++ b/drivers/pinctrl/pinctrl-rockchip.c
@@ -2393,11 +2393,24 @@ static int rockchip_pmx_set(struct pinctrl_dev *pctldev, unsigned selector,
return 0;
}
+static int rockchip_pmx_gpio_set_direction(struct pinctrl_dev *pctldev,
+ struct pinctrl_gpio_range *range,
+ unsigned offset,
+ bool input)
+{
+ struct rockchip_pinctrl *info = pinctrl_dev_get_drvdata(pctldev);
+ struct rockchip_pin_bank *bank;
+
+ bank = pin_to_bank(info, offset);
+ return rockchip_set_mux(bank, offset - bank->pin_base, RK_FUNC_GPIO);
+}
+
static const struct pinmux_ops rockchip_pmx_ops = {
.get_functions_count = rockchip_pmx_get_funcs_count,
.get_function_name = rockchip_pmx_get_func_name,
.get_function_groups = rockchip_pmx_get_groups,
.set_mux = rockchip_pmx_set,
+ .gpio_set_direction = rockchip_pmx_gpio_set_direction,
};
/*
--
2.37.3
The following message is seen during boot, and the activation of the
console port gets delayed until normal serial port activation.
[ 0.001346] irq: no irq domain found for pic@930 !
The console port doesn't need an irq; perform the irq reservation later,
during cpm_uart probe.
While at it, don't use NO_IRQ but 0 which is the value returned
by irq_of_parse_and_map() in case of error. By chance powerpc's
NO_IRQ has value 0 but on some architectures it is -1.
Fixes: 14d893fc6846 ("powerpc/8xx: Convert CPM1 interrupt controller to platform_device")
Cc: stable(a)vger.kernel.org
Signed-off-by: Christophe Leroy <christophe.leroy(a)csgroup.eu>
---
drivers/tty/serial/cpm_uart/cpm_uart_core.c | 22 ++++++++++-----------
1 file changed, 10 insertions(+), 12 deletions(-)
diff --git a/drivers/tty/serial/cpm_uart/cpm_uart_core.c b/drivers/tty/serial/cpm_uart/cpm_uart_core.c
index db07d6a5d764..fa5c4633086e 100644
--- a/drivers/tty/serial/cpm_uart/cpm_uart_core.c
+++ b/drivers/tty/serial/cpm_uart/cpm_uart_core.c
@@ -1214,12 +1214,6 @@ static int cpm_uart_init_port(struct device_node *np,
pinfo->port.fifosize = pinfo->tx_nrfifos * pinfo->tx_fifosize;
spin_lock_init(&pinfo->port.lock);
- pinfo->port.irq = irq_of_parse_and_map(np, 0);
- if (pinfo->port.irq == NO_IRQ) {
- ret = -EINVAL;
- goto out_pram;
- }
-
for (i = 0; i < NUM_GPIOS; i++) {
struct gpio_desc *gpiod;
@@ -1229,7 +1223,7 @@ static int cpm_uart_init_port(struct device_node *np,
if (IS_ERR(gpiod)) {
ret = PTR_ERR(gpiod);
- goto out_irq;
+ goto out_pram;
}
if (gpiod) {
@@ -1255,8 +1249,6 @@ static int cpm_uart_init_port(struct device_node *np,
return cpm_uart_request_port(&pinfo->port);
-out_irq:
- irq_dispose_mapping(pinfo->port.irq);
out_pram:
cpm_uart_unmap_pram(pinfo, pram);
out_mem:
@@ -1436,11 +1428,17 @@ static int cpm_uart_probe(struct platform_device *ofdev)
/* initialize the device pointer for the port */
pinfo->port.dev = &ofdev->dev;
+ pinfo->port.irq = irq_of_parse_and_map(ofdev->dev.of_node, 0);
+ if (!pinfo->port.irq)
+ return -EINVAL;
+
ret = cpm_uart_init_port(ofdev->dev.of_node, pinfo);
- if (ret)
- return ret;
+ if (!ret)
+ return uart_add_one_port(&cpm_reg, &pinfo->port);
- return uart_add_one_port(&cpm_reg, &pinfo->port);
+ irq_dispose_mapping(pinfo->port.irq);
+
+ return ret;
}
static int cpm_uart_remove(struct platform_device *ofdev)
--
2.37.1
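A kernel-style sketch of the resulting probe ordering (init_port() and add_port() are placeholders for the driver's real helpers, so this is not compilable standalone):

#include <linux/of_irq.h>
#include <linux/platform_device.h>

static int sketch_probe(struct platform_device *ofdev)
{
	unsigned int irq = irq_of_parse_and_map(ofdev->dev.of_node, 0);
	int ret;

	if (!irq)			/* 0, not NO_IRQ, is the error value */
		return -EINVAL;

	ret = init_port(ofdev);		/* placeholder for cpm_uart_init_port() */
	if (!ret)
		return add_port(ofdev);	/* placeholder for uart_add_one_port() */

	irq_dispose_mapping(irq);	/* undo the mapping on failure */
	return ret;
}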
From: Quentin Schulz <quentin.schulz(a)theobroma-systems.com>
Before the split of the gpio and pinctrl sections into their own drivers,
rockchip_set_mux was called in pinmux_ops.gpio_set_direction for
configuring a pin in its GPIO function.
This is essential for cases where pinctrl is "bypassed" by gpio
consumers; otherwise the GPIO function is not configured for the pin and
it does not work. Such was the case for the sysfs/libgpiod userspace
GPIO handling.
Let's call pinctrl_gpio_direction_input/output when setting the
direction of a GPIO so that the pinctrl core requests from the rockchip
pinctrl driver to put the pin in its GPIO function.
Fixes: 9ce9a02039de ("pinctrl/rockchip: drop the gpio related codes")
Fixes: 936ee2675eee ("gpio/rockchip: add driver for rockchip gpio")
Cc: stable(a)vger.kernel.org
Signed-off-by: Quentin Schulz <quentin.schulz(a)theobroma-systems.com>
---
drivers/gpio/gpio-rockchip.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c
index bb50335239ac8..b83913e1ee49e 100644
--- a/drivers/gpio/gpio-rockchip.c
+++ b/drivers/gpio/gpio-rockchip.c
@@ -156,6 +156,12 @@ static int rockchip_gpio_set_direction(struct gpio_chip *chip,
unsigned long flags;
u32 data = input ? 0 : 1;
+
+ if (input)
+ pinctrl_gpio_direction_input(bank->pin_base + offset);
+ else
+ pinctrl_gpio_direction_output(bank->pin_base + offset);
+
raw_spin_lock_irqsave(&bank->slock, flags);
rockchip_gpio_writel_bit(bank, offset, data, bank->gpio_regs->port_ddr);
raw_spin_unlock_irqrestore(&bank->slock, flags);
--
2.37.3
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
When tracing is disabled, there's no reason for waiters to stay
waiting; wake them up, otherwise tasks get stuck when they should be
flushing the buffers.
Cc: stable(a)vger.kernel.org
Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/trace.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 58afc83afc9d..bb5597c6bfc1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8334,6 +8334,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if (ret)
goto out;
+ /* No need to wait after waking up when tracing is off */
+ if (!tracer_tracing_is_on(iter->tr))
+ goto out;
+
/* Make sure we see the new wait_index */
smp_rmb();
if (wait_index != iter->wait_index)
@@ -9065,6 +9069,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
tracer_tracing_off(tr);
if (tr->current_trace->stop)
tr->current_trace->stop(tr);
+ /* Wake up any waiters */
+ ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS);
}
mutex_unlock(&trace_types_lock);
}
--
2.35.1
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
If a process is waiting on the ring buffer for data, there currently isn't
a clean way to force it to wake up. Add an ioctl call that will force any
tasks that are waiting on the trace_pipe_raw file to wake up.
Link: https://lkml.kernel.org/r/20220929095029.117f913f@gandalf.local.home
Cc: stable(a)vger.kernel.org
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/trace.c | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e101b0764b39..58afc83afc9d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8349,12 +8349,34 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return ret;
}
+/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
+static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ if (cmd)
+ return -ENOIOCTLCMD;
+
+ mutex_lock(&trace_types_lock);
+
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
+ mutex_unlock(&trace_types_lock);
+ return 0;
+}
+
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
.poll = tracing_buffers_poll,
.release = tracing_buffers_release,
.splice_read = tracing_buffers_splice_read,
+ .unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
};
--
2.35.1
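From userspace, the new hook is just an ioctl with cmd 0 on a trace_pipe_raw file. A minimal sketch, assuming tracefs is mounted at /sys/kernel/tracing:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw",
		      O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, 0) < 0)	/* cmd 0: wake up all waiters */
		perror("ioctl");
	close(fd);
	return 0;
}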
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
When the file that represents the ring buffer is closed, there may be
waiters waiting on more input from the ring buffer. Call
ring_buffer_wake_waiters() to wake up any waiters when the file is
closed.
Link: https://lkml.kernel.org/r/20220927231825.182416969@goodmis.org
Cc: stable(a)vger.kernel.org
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
include/linux/trace_events.h | 1 +
kernel/trace/trace.c | 15 +++++++++++++++
2 files changed, 16 insertions(+)
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8401dec93c15..20749bd9db71 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -92,6 +92,7 @@ struct trace_iterator {
unsigned int temp_size;
char *fmt; /* modified format holder */
unsigned int fmt_size;
+ long wait_index;
/* trace_seq for __print_flags() and __print_symbolic() etc. */
struct trace_seq tmp_seq;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aed7ea6e6045..e101b0764b39 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8160,6 +8160,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
__trace_array_put(iter->tr);
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
@@ -8313,6 +8319,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
+ long wait_index;
+
if (ret)
goto out;
@@ -8320,10 +8328,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
+ wait_index = READ_ONCE(iter->wait_index);
+
ret = wait_on_pipe(iter, iter->tr->buffer_percent);
if (ret)
goto out;
+ /* Make sure we see the new wait_index */
+ smp_rmb();
+ if (wait_index != iter->wait_index)
+ goto out;
+
goto again;
}
--
2.35.1
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
On closing of a file that represents a ring buffer, or on flushing the
file, there may be waiters on the ring buffer that need to be woken up
so they can exit the ring_buffer_wait() function.
Add ring_buffer_wake_waiters() to wake up the waiters on the ring buffer
and allow them to exit the wait loop.
Link: https://lkml.kernel.org/r/20220928133938.28dc2c27@gandalf.local.home
Cc: stable(a)vger.kernel.org
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Fixes: 15693458c4bc0 ("tracing/ring-buffer: Move poll wake ups into ring buffer code")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
include/linux/ring_buffer.h | 2 +-
kernel/trace/ring_buffer.c | 39 +++++++++++++++++++++++++++++++++++++
2 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index dac53fd3afea..2504df9a0453 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -101,7 +101,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full);
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table);
-
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu);
#define RING_BUFFER_ALL_CPUS -1
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5a7d818ca3ea..3046deacf7b3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -413,6 +413,7 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
+ long wait_index;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -924,6 +925,37 @@ static void rb_wake_up_waiters(struct irq_work *work)
}
}
+/**
+ * ring_buffer_wake_waiters - wake up any waiters on this ring buffer
+ * @buffer: The ring buffer to wake waiters on
+ *
+ * In the case of a file that represents a ring buffer is closing,
+ * it is prudent to wake up any waiters that are on this.
+ */
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct rb_irq_work *rbwork;
+
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+
+ /* Wake up individual ones too. One level recursion */
+ for_each_buffer_cpu(buffer, cpu)
+ ring_buffer_wake_waiters(buffer, cpu);
+
+ rbwork = &buffer->irq_work;
+ } else {
+ cpu_buffer = buffer->buffers[cpu];
+ rbwork = &cpu_buffer->irq_work;
+ }
+
+ rbwork->wait_index++;
+ /* make sure the waiters see the new index */
+ smp_wmb();
+
+ rb_wake_up_waiters(&rbwork->work);
+}
+
/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
@@ -939,6 +971,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
+ long wait_index;
int ret = 0;
/*
@@ -957,6 +990,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
work = &cpu_buffer->irq_work;
}
+ wait_index = READ_ONCE(work->wait_index);
while (true) {
if (full)
@@ -1021,6 +1055,11 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
}
schedule();
+
+ /* Make sure to see the new wait index */
+ smp_rmb();
+ if (wait_index != work->wait_index)
+ break;
}
if (full)
--
2.35.1
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
The wake up waiters code only checks the "wakeup_full" variable and not
"full_waiters_pending". The full_waiters_pending is set when a waiter is
added to the wait queue. The wakeup_full is only set when an event is
triggered, and it clears the full_waiters_pending to avoid multiple calls
to irq_work_queue().
The irq_work callback really needs to check both wakeup_full as well as
full_waiters_pending such that this code can be used to wake up waiters
when a file is closed that represents the ring buffer and the waiters need
to be woken up.
Link: https://lkml.kernel.org/r/20220927231824.209460321@goodmis.org
Cc: stable(a)vger.kernel.org
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Fixes: 15693458c4bc0 ("tracing/ring-buffer: Move poll wake ups into ring buffer code")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/ring_buffer.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 02db92c9eb1b..5a7d818ca3ea 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -917,8 +917,9 @@ static void rb_wake_up_waiters(struct irq_work *work)
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
wake_up_all(&rbwork->waiters);
- if (rbwork->wakeup_full) {
+ if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
rbwork->wakeup_full = false;
+ rbwork->full_waiters_pending = false;
wake_up_all(&rbwork->full_waiters);
}
}
--
2.35.1
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
The logic to know when the shortest waiters on the ring buffer should be
woken up or not uses a less-than instead of a greater-than compare,
which causes the shortest_full to actually track the longest.
Link: https://lkml.kernel.org/r/20220927231823.718039222@goodmis.org
Cc: stable(a)vger.kernel.org
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Fixes: 2c2b0a78b3739 ("ring-buffer: Add percentage of ring buffer full to wake up reader")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/ring_buffer.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6b145d48dfd1..02db92c9eb1b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1011,7 +1011,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
nr_pages = cpu_buffer->nr_pages;
dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full < full)
+ cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
if (!pagebusy &&
--
2.35.1
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
If a page is partially read, and then the splice system call is run
against the ring buffer, it will always fail to read, no matter how much
is in the ring buffer. That's because the code path for a partial read of
the page will fail if the "full" flag is set.
The splice system call wants full pages, so if the read of the ring buffer
is not yet full, it should return zero, and the splice will block. But if
a previous read was done, where the beginning has been consumed, it should
still be given to the splice caller if the rest of the page has been
written to.
This caused the splice command to never consume data in this scenario, and
let the ring buffer just fill up and lose events.
Link: https://lkml.kernel.org/r/20220927144317.46be6b80@gandalf.local.home
Cc: stable(a)vger.kernel.org
Fixes: 8789a9e7df6bf ("ring-buffer: read page interface")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/ring_buffer.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d59b6a328b7f..6b145d48dfd1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5616,7 +5616,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
unsigned int pos = 0;
unsigned int size;
- if (full)
+ /*
+ * If a full page is expected, this can still be returned
+ * if there's been a previous partial read and the
+ * rest of the page can be read and the commit page is off
+ * the reader page.
+ */
+ if (full &&
+ (!read || (len < (commit - read)) ||
+ cpu_buffer->reader_page == cpu_buffer->commit_page))
goto out_unlock;
if (len > (commit - read))
--
2.35.1
> -----Original Message-----
> From: Gaurav Kohli <gauravkohli(a)linux.microsoft.com>
> Sent: Wednesday, September 28, 2022 9:49 AM
> To: KY Srinivasan <kys(a)microsoft.com>; Haiyang Zhang
> <haiyangz(a)microsoft.com>; Stephen Hemminger
> <sthemmin(a)microsoft.com>; wei.liu(a)kernel.org; Dexuan Cui
> <decui(a)microsoft.com>; linux-hyperv(a)vger.kernel.org;
> netdev(a)vger.kernel.org
> Subject: [PATCH net] hv_netvsc: Fix race between VF offering and VF
> association message from host
>
> During VM boot, there is a possibility that the VF registration
> call comes before the VF association message from the host.
>
> This might break the netvsc VF path. To prevent that, block
> VF registration until the VF bind message comes from the host.
>
> Cc: stable(a)vger.kernel.org
> Fixes: 00d7ddba11436 ("hv_netvsc: pair VF based on serial number")
> Signed-off-by: Gaurav Kohli <gauravkohli(a)linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz(a)microsoft.com>
By the way, did you use "git send-email"? I didn't see the stable(a)vger.kernel.org cc-ed in your original email.
In cases where swiotlb is enabled, dma_max_mapping_size takes into
account the min align mask for the device. Right now the mask is
set after the max hw sectors are calculated, which might result in
a request size that overflows the swiotlb buffer.
Set the min align mask for nvme driver before calling
dma_max_mapping_size while calculating max hw sectors.
Fixes: 7637de311bd2 ("nvme-pci: limit max_hw_sectors based on the DMA max mapping size")
Cc: stable(a)vger.kernel.org
Signed-off-by: Rishabh Bhatnagar <risbhat(a)amazon.com>
---
Changes in V2:
- Add Cc: <stable(a)vger.kernel.org> tag
- Improve the commit text
- Add patch version
Changes in V1:
- Add fixes tag
drivers/nvme/host/pci.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 98864b853eef..30e71e41a0a2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2834,6 +2834,8 @@ static void nvme_reset_work(struct work_struct *work)
nvme_start_admin_queue(&dev->ctrl);
}
+ dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
+
/*
* Limit the max command size to prevent iod->sg allocations going
* over a single page.
@@ -2846,7 +2848,6 @@ static void nvme_reset_work(struct work_struct *work)
* Don't limit the IOMMU merged segment size.
*/
dma_set_max_seg_size(dev->dev, 0xffffffff);
- dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
mutex_unlock(&dev->shutdown_lock);
--
2.37.1
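The dependency being reordered here, as a kernel-style sketch (not the full reset path; the page-size argument stands in for NVME_CTRL_PAGE_SIZE):

#include <linux/blkdev.h>
#include <linux/dma-mapping.h>

static u32 sketch_max_hw_sectors(struct device *dev, unsigned int page_size)
{
	/* Must come first: swiotlb folds the min align mask into the
	 * mapping size it reports. */
	dma_set_min_align_mask(dev, page_size - 1);

	/* The bound now already accounts for the alignment slack, so a
	 * max_hw_sectors derived from it cannot overflow the bounce
	 * buffer. */
	return dma_max_mapping_size(dev) >> SECTOR_SHIFT;
}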
The calling convention for prep_slave_sg is to return NULL on error and
provide an error log to the system. Qcom-adm instead provides an error
pointer when an error occurs. This indirectly causes a kernel panic, for
example for the nandc driver, which checks only whether the pointer
returned by device_prep_slave_sg is non-NULL. Returning an error pointer
makes nandc think the device_prep_slave_sg function completed correctly,
and the kernel panics later in the code.
While nandc is the one that makes the kernel crash, it was pointed out
that the real problem is qcom-adm not following the calling convention
for that function.
To fix this, drop the error pointer returns and return NULL with an
error log.
Fixes: 03de6b273805 ("dmaengine: qcom-adm: stop abusing slave_id config")
Fixes: 5c9f8c2dbdbe ("dmaengine: qcom: Add ADM driver")
Signed-off-by: Christian Marangi <ansuelsmth(a)gmail.com>
Cc: stable(a)vger.kernel.org # v5.11+
---
drivers/dma/qcom/qcom_adm.c | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/drivers/dma/qcom/qcom_adm.c b/drivers/dma/qcom/qcom_adm.c
index facdacf8aede..cd3f12cf4721 100644
--- a/drivers/dma/qcom/qcom_adm.c
+++ b/drivers/dma/qcom/qcom_adm.c
@@ -379,13 +379,13 @@ static struct dma_async_tx_descriptor *adm_prep_slave_sg(struct dma_chan *chan,
if (blk_size < 0) {
dev_err(adev->dev, "invalid burst value: %d\n",
burst);
- return ERR_PTR(-EINVAL);
+ return NULL;
}
crci = achan->crci & 0xf;
if (!crci || achan->crci > 0x1f) {
dev_err(adev->dev, "invalid crci value\n");
- return ERR_PTR(-EINVAL);
+ return NULL;
}
}
@@ -403,8 +403,10 @@ static struct dma_async_tx_descriptor *adm_prep_slave_sg(struct dma_chan *chan,
}
async_desc = kzalloc(sizeof(*async_desc), GFP_NOWAIT);
- if (!async_desc)
- return ERR_PTR(-ENOMEM);
+ if (!async_desc) {
+ dev_err(adev->dev, "not enough memory for async_desc struct\n");
+ return NULL;
+ }
async_desc->mux = achan->mux ? ADM_CRCI_CTL_MUX_SEL : 0;
async_desc->crci = crci;
@@ -414,8 +416,10 @@ static struct dma_async_tx_descriptor *adm_prep_slave_sg(struct dma_chan *chan,
sizeof(*cple) + 2 * ADM_DESC_ALIGN;
async_desc->cpl = kzalloc(async_desc->dma_len, GFP_NOWAIT);
- if (!async_desc->cpl)
+ if (!async_desc->cpl) {
+ dev_err(adev->dev, "not enough memory for cpl struct\n");
goto free;
+ }
async_desc->adev = adev;
@@ -437,8 +441,10 @@ static struct dma_async_tx_descriptor *adm_prep_slave_sg(struct dma_chan *chan,
async_desc->dma_addr = dma_map_single(adev->dev, async_desc->cpl,
async_desc->dma_len,
DMA_TO_DEVICE);
- if (dma_mapping_error(adev->dev, async_desc->dma_addr))
+ if (dma_mapping_error(adev->dev, async_desc->dma_addr)) {
+ dev_err(adev->dev, "dma mapping error for cpl\n");
goto free;
+ }
cple_addr = async_desc->dma_addr + ((void *)cple - async_desc->cpl);
@@ -454,7 +460,7 @@ static struct dma_async_tx_descriptor *adm_prep_slave_sg(struct dma_chan *chan,
free:
kfree(async_desc);
- return ERR_PTR(-ENOMEM);
+ return NULL;
}
/**
--
2.37.2
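Why an ERR_PTR() return breaks these callers can be seen in a few self-contained lines; the macro below mimics the kernel's, and the check mirrors what consumers such as nandc do:

#include <stdio.h>

#define ERR_PTR(err)	((void *)(long)(err))

int main(void)
{
	void *desc = ERR_PTR(-22);	/* what qcom-adm used to return */

	if (!desc)	/* the only check many callers perform */
		printf("error handled\n");
	else
		printf("treated as valid, dereferenced later: crash\n");
	return 0;
}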
Fix the broken slave_config function that incorrectly compares the
peripheral_size with the size of the config pointer instead of the size
of the config struct. This causes the crci value to be ignored and causes
a kernel panic on any slave that uses the adm driver.
To fix this, compare against the size of the struct and NOT the size of
the pointer.
Fixes: 03de6b273805 ("dmaengine: qcom-adm: stop abusing slave_id config")
Signed-off-by: Christian Marangi <ansuelsmth(a)gmail.com>
Cc: stable(a)vger.kernel.org # v5.17+
---
drivers/dma/qcom/qcom_adm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/dma/qcom/qcom_adm.c b/drivers/dma/qcom/qcom_adm.c
index facdacf8aede..c77d9de853de 100644
--- a/drivers/dma/qcom/qcom_adm.c
+++ b/drivers/dma/qcom/qcom_adm.c
@@ -494,7 +494,7 @@ static int adm_slave_config(struct dma_chan *chan, struct dma_slave_config *cfg)
spin_lock_irqsave(&achan->vc.lock, flag);
memcpy(&achan->slave, cfg, sizeof(struct dma_slave_config));
- if (cfg->peripheral_size == sizeof(config))
+ if (cfg->peripheral_size == sizeof(*config))
achan->crci = config->crci;
spin_unlock_irqrestore(&achan->vc.lock, flag);
--
2.37.2
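The pitfall in isolation: sizeof applied to the pointer measures the pointer itself, not the struct it points to, so the comparison almost never matches. A sketch with a hypothetical config struct (the real one lives in the qcom-adm headers):

#include <stdio.h>

struct periph_cfg { unsigned int crci; unsigned int mux; unsigned int flags; };

int main(void)
{
	struct periph_cfg cfg = { 1, 0, 0 };
	struct periph_cfg *config = &cfg;

	printf("sizeof(config)  = %zu\n", sizeof(config));	/* pointer: 8 on 64-bit */
	printf("sizeof(*config) = %zu\n", sizeof(*config));	/* struct: 12 here */
	return 0;
}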
The check in __ext4_read_dirblock() for block being outside of directory
size was wrong because it compared block number against directory size
in bytes. Fix it.
Fixes: 65f8ea4cd57d ("ext4: check if directory block is within i_size")
CVE: CVE-2022-1184
CC: stable(a)vger.kernel.org
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/ext4/namei.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3a31b662f661..bc2e0612ec32 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -126,7 +126,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
struct ext4_dir_entry *dirent;
int is_dx_block = 0;
- if (block >= inode->i_size) {
+ if (block >= inode->i_size >> inode->i_blkbits) {
ext4_error_inode(inode, func, line, block,
"Attempting to read directory block (%u) that is past i_size (%llu)",
block, inode->i_size);
--
2.35.3
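The unit mismatch is easy to demonstrate: a directory of i_size bytes spans i_size >> blkbits blocks, so a block index must be compared against that, not against i_size itself. A minimal numeric sketch:

#include <stdio.h>

int main(void)
{
	unsigned long long i_size = 8192;	/* 2 blocks of 4 KiB */
	unsigned int blkbits = 12;
	unsigned int block = 2;			/* first out-of-range index */

	printf("old check rejects: %d\n", block >= i_size);		 /* 0: bug */
	printf("new check rejects: %d\n", block >= (i_size >> blkbits)); /* 1 */
	return 0;
}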
Previously, the fast pool was dumped into the main pool periodically in
the fast pool's hard IRQ handler. This worked fine and there weren't
problems with it, until RT came around. Since RT converts spinlocks into
sleeping locks, problems cropped up. Rather than switching to raw
spinlocks, the RT developers preferred we make the transformation from
originally doing:
do_some_stuff()
spin_lock()
do_some_other_stuff()
spin_unlock()
to doing:
do_some_stuff()
queue_work_on(some_other_stuff_worker)
This is an ordinary pattern done all over the kernel. However, Sherry
noticed a 10% performance regression in qperf TCP over a 40gbps
InfiniBand card. Quoting her message:
> MT27500 Family [ConnectX-3] cards:
> Infiniband device 'mlx4_0' port 1 status:
> default gid: fe80:0000:0000:0000:0010:e000:0178:9eb1
> base lid: 0x6
> sm lid: 0x1
> state: 4: ACTIVE
> phys state: 5: LinkUp
> rate: 40 Gb/sec (4X QDR)
> link_layer: InfiniBand
>
> Cards are configured with IP addresses on private subnet for IPoIB
> performance testing.
> Regression identified in this bug is in TCP latency in this stack as reported
> by qperf tcp_lat metric:
>
> We have one system listen as a qperf server:
> [root@yourQperfServer ~]# qperf
>
> Have the other system connect to qperf server as a client (in this
> case, it’s X7 server with Mellanox card):
> [root@yourQperfClient ~]# numactl -m0 -N0 qperf 20.20.20.101 -v -uu -ub --time 60 --wait_server 20 -oo msg_size:4K:1024K:*2 tcp_lat
Rather than incur the scheduling latency from queue_work_on, we can
instead switch to a tasklet, which will run on the same core -- exactly
what we want -- and happen during the context transition without
additional scheduling latency, with minimal logic in the enqueuing path.
Hopefully this restores performance from prior to the RT changes.
Reported-by: Sherry Yang <sherry.yang(a)oracle.com>
Suggested-by: Sultan Alsawaf <sultan(a)kerneltoast.com>
Fixes: 58340f8e952b ("random: defer fast pool mixing to worker")
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/lkml/YyuREcGAXV9828w5@zx2c4.com/
Signed-off-by: Jason A. Donenfeld <Jason(a)zx2c4.com>
---
Hi Sherry,
I'm not going to commit to this until I receive your `Tested-by:`, so
please let me know if this fixes the problem. If not, we'll try
something else.
Thanks,
Jason
drivers/char/random.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 520a385c7dab..ad17b36cf977 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -918,13 +918,16 @@ EXPORT_SYMBOL_GPL(unregister_random_vmfork_notifier);
#endif
struct fast_pool {
- struct work_struct mix;
+ struct tasklet_struct mix;
unsigned long pool[4];
unsigned long last;
unsigned int count;
};
+static void mix_interrupt_randomness(struct tasklet_struct *work);
+
static DEFINE_PER_CPU(struct fast_pool, irq_randomness) = {
+ .mix = { .use_callback = true, .callback = mix_interrupt_randomness },
#ifdef CONFIG_64BIT
#define FASTMIX_PERM SIPHASH_PERMUTATION
.pool = { SIPHASH_CONST_0, SIPHASH_CONST_1, SIPHASH_CONST_2, SIPHASH_CONST_3 }
@@ -973,7 +976,7 @@ int __cold random_online_cpu(unsigned int cpu)
}
#endif
-static void mix_interrupt_randomness(struct work_struct *work)
+static void mix_interrupt_randomness(struct tasklet_struct *work)
{
struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
/*
@@ -1027,10 +1030,8 @@ void add_interrupt_randomness(int irq)
if (new_count < 1024 && !time_is_before_jiffies(fast_pool->last + HZ))
return;
- if (unlikely(!fast_pool->mix.func))
- INIT_WORK(&fast_pool->mix, mix_interrupt_randomness);
fast_pool->count |= MIX_INFLIGHT;
- queue_work_on(raw_smp_processor_id(), system_highpri_wq, &fast_pool->mix);
+ tasklet_hi_schedule(&fast_pool->mix);
}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);
--
2.37.3
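The pattern the patch switches to, as a kernel-style sketch (shown out of context, with a trimmed-down fast_pool):

#include <linux/interrupt.h>

struct fast_pool {
	struct tasklet_struct mix;
	unsigned long pool[4];
};

static void pool_mix(struct tasklet_struct *t)
{
	struct fast_pool *pool = container_of(t, struct fast_pool, mix);

	(void)pool;	/* the real handler mixes pool->pool into the input pool */
}

static void pool_init(struct fast_pool *pool)
{
	tasklet_setup(&pool->mix, pool_mix);
}

static void on_interrupt(struct fast_pool *pool)
{
	/* Runs pool_mix in softirq context on this same CPU, without
	 * the scheduler round-trip a workqueue would add. */
	tasklet_hi_schedule(&pool->mix);
}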
I hit a very bad problem during my tests of SENDMSG_ZC.
BUG(); in first_iovec_segment() triggered very easily.
The problem was io_setup_async_msg() in the partial retry case,
which seems to happen more often with _ZC.
iov_iter_iovec_advance() may change i->iov so that i->iov_offset is
relative only to the first element.
This means kmsg->msg.msg_iter.iov is no longer the
same as kmsg->fast_iov.
But this would rewind the copy to be the start of
async_msg->fast_iov, which means the internal
state of async_msg->msg.msg_iter is inconsistent.
I tested with 5 vectors with lengths like 4, 0, 64, 20, 8388608
and got short writes with:
- ret=2675244 min_ret=8388692 => remaining 5713448 sr->done_io=2675244
- ret=-EAGAIN => io_uring_poll_arm
- ret=4911225 min_ret=5713448 => remaining 802223 sr->done_io=7586469
- ret=-EAGAIN => io_uring_poll_arm
- ret=802223 min_ret=802223 => res=8388692
While this was easily triggered with SENDMSG_ZC (queued for 6.1),
it was a potential problem starting with 7ba89d2af17aa879dda30f5d5d3f152e587fc551
in 5.18 for IORING_OP_RECVMSG.
And also with 4c3c09439c08b03d9503df0ca4c7619c5842892e in 5.19
for IORING_OP_SENDMSG.
However 257e84a5377fbbc336ff563833a8712619acce56 introduced the critical
code into io_setup_async_msg() in 5.11.
Fixes: 7ba89d2af17aa ("io_uring: ensure recv and recvmsg handle MSG_WAITALL correctly")
Fixes: 257e84a5377fb ("io_uring: refactor sendmsg/recvmsg iov managing")
Cc: stable(a)vger.kernel.org
Signed-off-by: Stefan Metzmacher <metze(a)samba.org>
---
io_uring/net.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/io_uring/net.c b/io_uring/net.c
index 60e392f7f2dc..a81fccd38ae4 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -165,8 +165,10 @@ static int io_setup_async_msg(struct io_kiocb *req,
memcpy(async_msg, kmsg, sizeof(*kmsg));
async_msg->msg.msg_name = &async_msg->addr;
/* if were using fast_iov, set it to the new one */
- if (!async_msg->free_iov)
- async_msg->msg.msg_iter.iov = async_msg->fast_iov;
+ if (!kmsg->free_iov) {
+ size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
+ async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
+ }
return -EAGAIN;
}
--
2.34.1
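The underlying bug pattern, reduced to plain C: after copying a struct that contains a pointer into its own embedded array, the pointer must be recomputed relative to the copy while preserving the iterator's current index, not rewound to the array start:

#include <stdio.h>
#include <string.h>

struct msg {
	int fast[8];
	int *cur;	/* may point into fast[], possibly advanced */
};

int main(void)
{
	struct msg src = { {0, 1, 2, 3, 4, 5, 6, 7}, NULL };
	struct msg dst;

	src.cur = &src.fast[3];	/* iterator advanced to index 3 */
	memcpy(&dst, &src, sizeof(dst));

	/* Old code: dst.cur = dst.fast; -- rewinds to index 0, state lost. */
	dst.cur = &dst.fast[src.cur - src.fast];	/* keep index 3 */
	printf("%d\n", *dst.cur);	/* prints 3 */
	return 0;
}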
A kernel daemon should not rely on the current thread, which is unknown
and might be malicious. Before this security fix,
ksmbd_override_fsids() didn't correctly override the FS UID/GID, which
means that arbitrary user space threads could trick the kernel into
impersonating arbitrary users or groups for file system access checks,
leading to file system access bypass.
This was found while investigating truncate support for Landlock:
https://lore.kernel.org/r/CAKYAXd8fpMJ7guizOjHgxEyyjoUwPsx3jLOPZP=wPYcbhkVX…
Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3")
Cc: Hyunchul Lee <hyc.lee(a)gmail.com>
Cc: Namjae Jeon <linkinjeon(a)kernel.org>
Cc: Steve French <smfrench(a)gmail.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Mickaël Salaün <mic(a)digikod.net>
Link: https://lore.kernel.org/r/20220929100447.108468-1-mic@digikod.net
---
fs/ksmbd/smb_common.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index 7f8ab14fb8ec..d96da872d70a 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -4,6 +4,8 @@
* Copyright (C) 2018 Namjae Jeon <linkinjeon(a)kernel.org>
*/
+#include <linux/user_namespace.h>
+
#include "smb_common.h"
#include "server.h"
#include "misc.h"
@@ -625,8 +627,8 @@ int ksmbd_override_fsids(struct ksmbd_work *work)
if (!cred)
return -ENOMEM;
- cred->fsuid = make_kuid(current_user_ns(), uid);
- cred->fsgid = make_kgid(current_user_ns(), gid);
+ cred->fsuid = make_kuid(&init_user_ns, uid);
+ cred->fsgid = make_kgid(&init_user_ns, gid);
gi = groups_alloc(0);
if (!gi) {
base-commit: f76349cf41451c5c42a99f18a9163377e4b364ff
--
2.37.2
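The essence of the fix is namespace selection for the ID mapping; a
condensed sketch in kernel style (illustrative only, not the complete
ksmbd function):

#include <linux/cred.h>
#include <linux/uidgid.h>
#include <linux/user_namespace.h>

/* Sketch (not the complete ksmbd_override_fsids()): a kernel thread
 * serving remote requests must map on-the-wire IDs in a namespace the
 * server controls. current_user_ns() is whatever namespace the
 * current, possibly attacker-chosen, task happens to be in, so it
 * must not be used here. */
static void set_fs_ids(struct cred *cred, uid_t uid, gid_t gid)
{
	/* Buggy: cred->fsuid = make_kuid(current_user_ns(), uid); */
	cred->fsuid = make_kuid(&init_user_ns, uid);
	cred->fsgid = make_kgid(&init_user_ns, gid);
}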
Sometime after 5.10.105 (seen in 5.10.132 and 6.0) there is a change
that makes setns(open("/proc/1/ns/net")) in the main thread change
the behaviour of the process's other threads.
I don't know how much is broken, but the following fails.
Create a network namespace (eg "test").
Create a 'bond' interface (eg "test0") in the namespace.
Then /proc/net/bonding/test0 only exists inside the namespace.
However if you run a program in the "test" namespace that does:
- create a thread.
- move the main thread into the "init" namespace.
- try to open /proc/net/bonding/test0 in the thread.
then the open fails.
I don't know how much else is affected and haven't tried
to bisect (I can't create bonds on my normal test kernel).
The test program below shows the problem.
Compile and run as:
# ip netns exec test strace -f test_prog /proc/net/bonding/test0
The second open by the child should succeed, but fails.
I can't see any changes to the bonding code, so I suspect
it is something much more fundamental.
It might only affect /proc/net, but it might also affect
which namespace sockets get created in.
IIRC ls -l /proc/n/task/*/ns gives the correct namespaces.
David
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>

#define delay(secs) poll(0, 0, (secs) * 1000)

static void *thread_fn(void *file)
{
	/* t=2: the whole process is still in the "test" netns */
	delay(2);
	open(file, O_RDONLY);
	/* t=7: the main thread has moved itself to the init netns;
	 * this open should still succeed for this thread, but fails */
	delay(5);
	open(file, O_RDONLY);
	return NULL;
}

int main(int argc, char **argv)
{
	pthread_t id;

	pthread_create(&id, NULL, thread_fn, argv[1]);
	delay(1);
	open(argv[1], O_RDONLY);
	delay(2);
	/* t=3: move only the main thread into the init net namespace */
	setns(open("/proc/1/ns/net", O_RDONLY), 0);
	delay(1);
	open(argv[1], O_RDONLY);
	delay(4);
	return 0;
}
When building s390 allmodconfig after commit 9b91a6523078 ("usb: gadget:
uvc: increase worker prio to WQ_HIGHPRI"), the following error occurs:
In file included from ../include/linux/string.h:253,
from ../include/linux/bitmap.h:11,
from ../include/linux/cpumask.h:12,
from ../include/linux/smp.h:13,
from ../include/linux/lockdep.h:14,
from ../include/linux/rcupdate.h:29,
from ../include/linux/rculist.h:11,
from ../include/linux/pid.h:5,
from ../include/linux/sched.h:14,
from ../include/linux/ratelimit.h:6,
from ../include/linux/dev_printk.h:16,
from ../include/linux/device.h:15,
from ../drivers/usb/gadget/function/f_uvc.c:9:
In function ‘fortify_memset_chk’,
inlined from ‘uvc_register_video’ at ../drivers/usb/gadget/function/f_uvc.c:424:2:
../include/linux/fortify-string.h:301:25: error: call to ‘__write_overflow_field’ declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning]
301 | __write_overflow_field(p_size_field, size);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This points to the memset() in uvc_register_video(). It is clear that
the argument to sizeof() is incorrect, as uvc->vdev (a 'struct
video_device') is being zeroed out but the size of uvc->video (a 'struct
uvc_video') is being used as the third argument to memset().
pahole shows that prior to commit 9b91a6523078 ("usb: gadget: uvc:
increase worker prio to WQ_HIGHPRI"), 'struct video_device' and
'struct uvc_video' had the same size, meaning that the argument to
sizeof() is incorrect semantically but there is no visible issue:
$ pahole -s build/drivers/usb/gadget/function/f_uvc.o | grep -E "(uvc_video|video_device)\s+"
video_device 1400 4
uvc_video 1400 3
After that change, uvc_video becomes slightly larger, meaning that the
memset() will write 8 bytes beyond uvc->vdev:
$ pahole -s build/drivers/usb/gadget/function/f_uvc.o | grep -E "(uvc_video|video_device)\s+"
video_device 1400 4
uvc_video 1408 3
Fix the argument to sizeof() so that there is no overwrite.
Cc: stable(a)vger.kernel.org
Fixes: e4ce9ed835bc ("usb: gadget: uvc: ensure the vdev is unset")
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
drivers/usb/gadget/function/f_uvc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/usb/gadget/function/f_uvc.c b/drivers/usb/gadget/function/f_uvc.c
index 71669e0e4d00..86bb0098fb66 100644
--- a/drivers/usb/gadget/function/f_uvc.c
+++ b/drivers/usb/gadget/function/f_uvc.c
@@ -421,7 +421,7 @@ uvc_register_video(struct uvc_device *uvc)
int ret;
/* TODO reference counting. */
- memset(&uvc->vdev, 0, sizeof(uvc->video));
+ memset(&uvc->vdev, 0, sizeof(uvc->vdev));
uvc->vdev.v4l2_dev = &uvc->v4l2_dev;
uvc->vdev.v4l2_dev->dev = &cdev->gadget->dev;
uvc->vdev.fops = &uvc_v4l2_fops;
base-commit: f76349cf41451c5c42a99f18a9163377e4b364ff
--
2.37.3
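The bug pattern is easy to demonstrate outside the kernel; a minimal
userspace analogue (hypothetical struct names, with sizes mirroring the
pahole output above):

#include <string.h>

struct a { char buf[1400]; };	/* plays the role of struct video_device */
struct b { char buf[1408]; };	/* plays the role of struct uvc_video    */

struct dev {
	struct a vdev;	/* the member actually being zeroed */
	struct b video;	/* the member whose size was used   */
};

static void clear_vdev(struct dev *d)
{
	/* Buggy: writes sizeof(d->video) == 1408 bytes into a 1400-byte
	 * member, clobbering the first 8 bytes of d->video. FORTIFY_SOURCE
	 * flags exactly this as a write beyond the size of the field:
	 *
	 *	memset(&d->vdev, 0, sizeof(d->video));
	 */

	/* Fixed: the size matches the object being cleared. */
	memset(&d->vdev, 0, sizeof(d->vdev));
}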
Hi, this is your Linux kernel regression tracker. CCing the regression
mailing list, as it should be in the loop for all regressions, as
explained here:
https://www.kernel.org/doc/html/latest/admin-guide/reporting-issues.html
Also CCing the stable ml, the NFS maintainers, and the authors of
31b992b3c39b, too.
On 22.09.22 23:46, Kurt Garloff wrote:
>
> a freshly compiled 5.15.69 kernel showed hangs with NFS.
> Typically mkdir would end up in a 'D' process state, but I
> have seen ls -l hanging as well.
> Server is kernel NFS 5.15.69.
>
> After reverting the last three NFS related commits,
> a68a734b19af NFS: Fix WARN_ON due to unionization of nfs_inode.nrequests
> 3b97deb4abf5 NFS: Fix another fsync() issue after a server reboot
> 31b992b3c39b NFS: Save some space in the inode
>
> things work normally again.
>
> As you can see, I suspected 31b992b3c39b ...
FWIW, that's e591b298d7ec in mainline.
> I know this report is light on details; if nothing like this has been
> reported yet, let me know and I'll try to find some time to investigate
> further.
>
> PS: Please keep me on Cc, I'm not subscribed to linux-nfs.
I wonder if this is a dup of this report:
https://lore.kernel.org/all/c5d8485b-0dbc-5192-4dc6-10ef2b86b520@molgen.mpg…
In that thread Trond mentioned
```
I believe this is a dependency that was introduced by the back port of
commit e591b298d7ec ("NFS: Save some space in the inode") into 5.15.68.
So the reason it wasn't seen is because the change is very recent.
FYI Greg and Sasha: please also consider pulling 6e176d47160c ("NFSv4:
Fixes for nfs4_inode_return_delegation()") into that stable series.
```
Anyway, for the rest of this mail:
[TLDR: I'm adding this regression report to the list of tracked
regressions; all text from me you find below is based on a few template
paragraphs you might have encountered already in similar form.]
Thanks for the report. To be sure below issue doesn't fall through the
cracks unnoticed, I'm adding it to regzbot, my Linux kernel regression
tracking bot:
#regzbot ^introduced 31b992b3c39b
#regzbot ignore-activity
This isn't a regression? This issue or a fix for it are already
discussed somewhere else? It was fixed already? You want to clarify when
the regression started to happen? Or point out I got the title or
something else totally wrong? Then just reply -- ideally also telling
regzbot about it, as explained here:
https://linux-regtracking.leemhuis.info/tracked-regression/
Reminder for developers: When fixing the issue, add 'Link:' tags
pointing to the report (the mail this one replies to), as explained in
the Linux kernel's documentation; the webpage above explains why this
is important for tracked regressions.
Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
P.S.: As the Linux kernel's regression tracker I deal with a lot of
reports and sometimes miss something important when writing mails like
this. If that's the case here, don't hesitate to tell me in a public
reply, it's in everyone's interest to set the public record straight.
The patch below does not apply to the 5.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
ca76d7d2812b ("perf record: Fix cpu mask bit setting for mixed mmaps")
cbd7bfc7fd99 ("tools/perf: Fix out of bound access to cpu mask array")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ca76d7d2812b46124291f99c9b50aaf63a936f23 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter(a)intel.com>
Date: Thu, 15 Sep 2022 15:26:11 +0300
Subject: [PATCH] perf record: Fix cpu mask bit setting for mixed mmaps
With mixed per-thread and (system-wide) per-cpu maps, the "any cpu" value
-1 must be skipped when setting CPU mask bits.
Prior to commit cbd7bfc7fd99acdd ("tools/perf: Fix out of bound access
to cpu mask array") the invalid setting went unnoticed, but since then
it causes perf record to fail with an error.
Example:
Before:
$ perf record -e intel_pt// --per-thread uname
Failed to initialize parallel data streaming masks
After:
$ perf record -e intel_pt// --per-thread uname
Linux
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.068 MB perf.data ]
Fixes: ae4f8ae16a078964 ("libperf evlist: Allow mixing per-thread and per-cpu mmaps")
Signed-off-by: Adrian Hunter <adrian.hunter(a)intel.com>
Acked-by: Namhyung Kim <namhyung(a)kernel.org>
Cc: Athira Rajeev <atrajeev(a)linux.vnet.ibm.com>
Cc: Ian Rogers <irogers(a)google.com>
Cc: Jiri Olsa <jolsa(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220915122612.81738-2-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index f87ef43eb820..0f711f88894c 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -3371,6 +3371,8 @@ static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cp
return 0;
perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
+ if (cpu.cpu == -1)
+ continue;
/* Return ENODEV is input cpu is greater than max cpu */
if ((unsigned long)cpu.cpu > mask->nbits)
return -ENODEV;
This is the start of the stable review cycle for the 4.19.260 release.
There are 55 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed, 28 Sep 2022 16:35:25 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.260-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.19.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.19.260-rc2
Raymond Tan <raymond.tan(a)intel.com>
usb: dwc3: pci: Allow Elkhart Lake to utilize DSM method for PM functionality
Jan Kara <jack(a)suse.cz>
ext4: make directory inode spreading reflect flexbg size
Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
workqueue: don't skip lockdep work dependency in cancel_work_sync()
Nathan Huckleberry <nhuck(a)google.com>
drm/rockchip: Fix return type of cdn_dp_connector_mode_valid
Yao Wang1 <Yao.Wang1(a)amd.com>
drm/amd/display: Limit user regamma to a valid value
Vitaly Kuznetsov <vkuznets(a)redhat.com>
Drivers: hv: Never allocate anything besides framebuffer from framebuffer memory region
Stefan Haberland <sth(a)linux.ibm.com>
s390/dasd: fix Oops in dasd_alias_get_start_dev due to missing pavgroup
Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
serial: tegra: Use uart_xmit_advance(), fixes icount.tx accounting
Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
serial: Create uart_xmit_advance()
Sean Anderson <seanga2(a)gmail.com>
net: sunhme: Fix packet reception for len < RX_COPY_THRESHOLD
Adrian Hunter <adrian.hunter(a)intel.com>
perf kcore_copy: Do not check /proc/modules is unchanged
Lieven Hey <lieven.hey(a)kdab.com>
perf jit: Include program header in ELF files
Marc Kleine-Budde <mkl(a)pengutronix.de>
can: gs_usb: gs_can_open(): fix race dev->can.state condition
Florian Westphal <fw(a)strlen.de>
netfilter: ebtables: fix memory leak when blob is malformed
Liang He <windhl(a)126.com>
of: mdio: Add of_node_put() when breaking out of for_each_xx
Michal Jaron <michalx.jaron(a)intel.com>
i40e: Fix set max_tx_rate when it is lower than 1 Mbps
Michal Jaron <michalx.jaron(a)intel.com>
i40e: Fix VF set max MTU size
Randy Dunlap <rdunlap(a)infradead.org>
MIPS: lantiq: export clk_get_io() for lantiq_wdt.ko
Benjamin Poirier <bpoirier(a)nvidia.com>
net: team: Unsync device addresses on ndo_stop
Lu Wei <luwei32(a)huawei.com>
ipvlan: Fix out-of-bound bugs caused by unset skb->mac_header
Brett Creeley <brett.creeley(a)intel.com>
iavf: Fix cached head and tail value for iavf_get_tx_pending
David Leadbeater <dgl(a)dgl.cx>
netfilter: nf_conntrack_irc: Tighten matching on DCC message
Igor Ryzhov <iryzhov(a)nfware.com>
netfilter: nf_conntrack_sip: fix ct_sip_walk_headers
Fabio Estevam <festevam(a)denx.de>
arm64: dts: rockchip: Remove 'enable-active-low' from rk3399-puma
zain wang <wzz(a)rock-chips.com>
arm64: dts: rockchip: Set RK3399-Gru PCLK_EDP to 24 MHz
Chao Yu <chao.yu(a)oppo.com>
mm/slub: fix to return errno if kmalloc() fails
Ard Biesheuvel <ardb(a)kernel.org>
efi: libstub: check Shim mode using MokSBStateRT
Callum Osmotherly <callum.osmotherly(a)gmail.com>
ALSA: hda/realtek: Enable 4-speaker output Dell Precision 5530 laptop
Kai Vehmanen <kai.vehmanen(a)linux.intel.com>
ALSA: hda: add Intel 5 Series / 3400 PCI DID
Mohan Kumar <mkumard(a)nvidia.com>
ALSA: hda/tegra: set depop delay for tegra
jerry meng <jerry-meng(a)foxmail.com>
USB: serial: option: add Quectel RM520N
Carl Yin(殷张成) <carl.yin(a)quectel.com>
USB: serial: option: add Quectel BG95 0x0203 composition
Alan Stern <stern(a)rowland.harvard.edu>
USB: core: Fix RST error in hub.c
Siddh Raman Pant <code(a)siddh.me>
wifi: mac80211: Fix UAF in ieee80211_scan_rx()
Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
usb: dwc3: pci: add support for the Intel Alder Lake-S
Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
usb: dwc3: pci: add support for the Intel Jasper Lake
Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
usb: dwc3: pci: add support for the Intel Tiger Lake PCH -H variant
Felipe Balbi <felipe.balbi(a)linux.intel.com>
usb: dwc3: pci: add support for TigerLake Devices
Felipe Balbi <felipe.balbi(a)linux.intel.com>
usb: dwc3: pci: Add Support for Intel Elkhart Lake Devices
Takashi Iwai <tiwai(a)suse.de>
ALSA: hda/sigmatel: Fix unused variable warning for beep power change
Hyunwoo Kim <imv4bel(a)gmail.com>
video: fbdev: pxa3xx-gcu: Fix integer overflow in pxa3xx_gcu_write
Youling Tang <tangyouling(a)loongson.cn>
mksysmap: Fix the mismatch of 'L0' symbols in System.map
Alexander Sverdlin <alexander.sverdlin(a)nokia.com>
MIPS: OCTEON: irq: Fix octeon_irq_force_ciu_mapping()
jerry.meng <jerry-meng(a)foxmail.com>
net: usb: qmi_wwan: add Quectel RM520N
Takashi Iwai <tiwai(a)suse.de>
ALSA: hda/sigmatel: Keep power up while beep is enabled
David Howells <dhowells(a)redhat.com>
rxrpc: Fix local destruction being repeated
Xiaolei Wang <xiaolei.wang(a)windriver.com>
regulator: pfuze100: Fix the global-out-of-bounds access in pfuze100_regulator_probe()
Takashi Iwai <tiwai(a)suse.de>
ASoC: nau8824: Fix semaphore unbalance at error paths
Stefan Metzmacher <metze(a)samba.org>
cifs: don't send down the destination address to sendmsg for a SOCK_STREAM
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
mvpp2: no need to check return value of debugfs_create functions
Bart Van Assche <bvanassche(a)acm.org>
nvmet: fix a use-after-free
Yang Yingliang <yangyingliang(a)huawei.com>
parisc: ccio-dma: Add missing iounmap in error path in ccio_probe()
Stuart Menefy <stuart.menefy(a)mathembedded.com>
drm/meson: Correct OSD1 global alpha value
Pali Rohár <pali(a)kernel.org>
gpio: mpc8xxx: Fix support for IRQ_TYPE_LEVEL_LOW flow_type in mpc85xx
Sergey Shtylyov <s.shtylyov(a)omp.ru>
of: fdt: fix off-by-one error in unflatten_dt_nodes()
-------------
Diffstat:
Makefile | 4 +--
.../boot/dts/rockchip/rk3399-gru-chromebook.dtsi | 8 +++++
arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi | 1 -
arch/mips/cavium-octeon/octeon-irq.c | 10 +++++++
arch/mips/lantiq/clk.c | 1 +
drivers/firmware/efi/libstub/secureboot.c | 8 ++---
drivers/gpio/gpio-mpc8xxx.c | 1 +
.../drm/amd/display/modules/color/color_gamma.c | 4 +++
drivers/gpu/drm/meson/meson_plane.c | 2 +-
drivers/gpu/drm/rockchip/cdn-dp-core.c | 5 ++--
drivers/hv/vmbus_drv.c | 10 ++++++-
drivers/net/can/usb/gs_usb.c | 4 +--
drivers/net/ethernet/intel/i40e/i40e_main.c | 32 ++++++++++++++++----
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 20 +++++++++++++
drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 5 +++-
drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c | 19 +-----------
drivers/net/ethernet/sun/sunhme.c | 4 +--
drivers/net/ipvlan/ipvlan_core.c | 6 ++--
drivers/net/team/team.c | 24 +++++++++++----
drivers/net/usb/qmi_wwan.c | 1 +
drivers/nvme/target/core.c | 5 ++--
drivers/of/fdt.c | 2 +-
drivers/of/of_mdio.c | 1 +
drivers/parisc/ccio-dma.c | 1 +
drivers/regulator/pfuze100-regulator.c | 2 +-
drivers/s390/block/dasd_alias.c | 9 ++++--
drivers/tty/serial/serial-tegra.c | 5 ++--
drivers/usb/core/hub.c | 2 +-
drivers/usb/dwc3/dwc3-pci.c | 23 ++++++++++++++-
drivers/usb/serial/option.c | 6 ++++
drivers/video/fbdev/pxa3xx-gcu.c | 2 +-
fs/cifs/transport.c | 4 +--
fs/ext4/ialloc.c | 2 +-
include/linux/serial_core.h | 17 +++++++++++
kernel/workqueue.c | 6 ++--
mm/slub.c | 5 +++-
net/bridge/netfilter/ebtables.c | 4 ++-
net/mac80211/scan.c | 11 ++++---
net/netfilter/nf_conntrack_irc.c | 34 ++++++++++++++++++----
net/netfilter/nf_conntrack_sip.c | 4 +--
net/rxrpc/local_object.c | 3 ++
scripts/mksysmap | 2 +-
sound/pci/hda/hda_intel.c | 2 ++
sound/pci/hda/patch_hdmi.c | 1 +
sound/pci/hda/patch_realtek.c | 1 +
sound/pci/hda/patch_sigmatel.c | 24 +++++++++++++++
sound/soc/codecs/nau8824.c | 17 ++++++-----
tools/perf/util/genelf.c | 14 +++++++++
tools/perf/util/genelf.h | 4 +++
tools/perf/util/symbol-elf.c | 7 ++---
50 files changed, 295 insertions(+), 94 deletions(-)
This reverts commit 24cd0b9bfdff126c066032b0d40ab0962d35e777.
1) commit 4e89dce72521 ("iommu/iova: Retry from last rb tree node if
iova search fails") tries to fix the problem that iova allocation can
fail while there is still free space available. This is not backported
to 5.10 stable.
2) commit fce54ed02757 ("scsi: hisi_sas: Limit max hw sectors for v3
HW") fixes the performance regression introduced by 1); however, it is
just a temporary solution and causes an IO performance regression of
its own because it limits the max IO size to PAGE_SIZE * 32 (128k for
a 4k page size).
3) John Garry posted a patchset to fix the problem.
4) The temporary solution is reverted.
It's weird that the patch in 2) is backported to 5.10 stable alone,
while the right thing to do is to backport them all together.
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
---
drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 7 -------
1 file changed, 7 deletions(-)
diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
index dfe7e6370d84..cd41dc061d87 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
@@ -2738,7 +2738,6 @@ static int slave_configure_v3_hw(struct scsi_device *sdev)
struct hisi_hba *hisi_hba = shost_priv(shost);
struct device *dev = hisi_hba->dev;
int ret = sas_slave_configure(sdev);
- unsigned int max_sectors;
if (ret)
return ret;
@@ -2756,12 +2755,6 @@ static int slave_configure_v3_hw(struct scsi_device *sdev)
}
}
- /* Set according to IOMMU IOVA caching limit */
- max_sectors = min_t(size_t, queue_max_hw_sectors(sdev->request_queue),
- (PAGE_SIZE * 32) >> SECTOR_SHIFT);
-
- blk_queue_max_hw_sectors(sdev->request_queue, max_sectors);
-
return 0;
}
--
2.31.1
From: ChiYuan Huang <cy_huang(a)richtek.com>
Fix a potential null pointer dereference if the bank index is over the maximum.
Refer to the discussion list for the experiment result on mt6370.
https://lore.kernel.org/all/20220914013345.GA5802@cyhuang-hp-elitebook-840-…
If not to check the bound, there is the same issue on mt6360.
Fixes: 3b0850440a06c ("mfd: mt6360: Merge different sub-devices I2C read/write")
Cc: stable(a)vger.kernel.org
Signed-off-by: ChiYuan Huang <cy_huang(a)richtek.com>
---
drivers/mfd/mt6360-core.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/mfd/mt6360-core.c b/drivers/mfd/mt6360-core.c
index 6eaa677..d375333 100644
--- a/drivers/mfd/mt6360-core.c
+++ b/drivers/mfd/mt6360-core.c
@@ -410,6 +410,9 @@ static int mt6360_regmap_read(void *context, const void *reg, size_t reg_size,
u8 crc;
int ret;
+ if (bank >= MT6360_SLAVE_MAX)
+ return -EINVAL;
+
if (bank == MT6360_SLAVE_PMIC || bank == MT6360_SLAVE_LDO) {
crc_needed = true;
ret = mt6360_xlate_pmicldo_addr(&reg_addr, val_size);
@@ -460,6 +463,9 @@ static int mt6360_regmap_write(void *context, const void *val, size_t val_size)
int write_size = val_size - MT6360_REGMAP_REG_BYTE_SIZE;
int ret;
+ if (bank >= MT6360_SLAVE_MAX)
+ return -EINVAL;
+
if (bank == MT6360_SLAVE_PMIC || bank == MT6360_SLAVE_LDO) {
crc_needed = true;
ret = mt6360_xlate_pmicldo_addr(&reg_addr, val_size - MT6360_REGMAP_REG_BYTE_SIZE);
--
2.7.4
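The guard itself is the standard index-validation idiom; a userspace
sketch of the same shape (illustrative names, not the driver code):

#include <errno.h>
#include <stddef.h>

#define SLAVE_MAX 4	/* stand-in for MT6360_SLAVE_MAX */

static void *clients[SLAVE_MAX];	/* stand-in for the per-bank i2c table */

/* An index taken from the regmap context must be range-checked before
 * it selects a slave; otherwise clients[bank] reads past the table (or
 * yields a NULL client), as observed in the mt6370 experiment above. */
static int select_bank(unsigned int bank, void **out)
{
	if (bank >= SLAVE_MAX)
		return -EINVAL;

	*out = clients[bank];
	return 0;
}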
This is the start of the stable review cycle for the 4.9.330 release.
There are 21 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed, 28 Sep 2022 16:35:25 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.9.330-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.9.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.9.330-rc2
Jan Kara <jack(a)suse.cz>
ext4: make directory inode spreading reflect flexbg size
Vitaly Kuznetsov <vkuznets(a)redhat.com>
Drivers: hv: Never allocate anything besides framebuffer from framebuffer memory region
Stefan Haberland <sth(a)linux.ibm.com>
s390/dasd: fix Oops in dasd_alias_get_start_dev due to missing pavgroup
Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
serial: tegra: Use uart_xmit_advance(), fixes icount.tx accounting
Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
serial: Create uart_xmit_advance()
Sean Anderson <seanga2(a)gmail.com>
net: sunhme: Fix packet reception for len < RX_COPY_THRESHOLD
Adrian Hunter <adrian.hunter(a)intel.com>
perf kcore_copy: Do not check /proc/modules is unchanged
Marc Kleine-Budde <mkl(a)pengutronix.de>
can: gs_usb: gs_can_open(): fix race dev->can.state condition
Randy Dunlap <rdunlap(a)infradead.org>
MIPS: lantiq: export clk_get_io() for lantiq_wdt.ko
Benjamin Poirier <bpoirier(a)nvidia.com>
net: team: Unsync device addresses on ndo_stop
Lu Wei <luwei32(a)huawei.com>
ipvlan: Fix out-of-bound bugs caused by unset skb->mac_header
David Leadbeater <dgl(a)dgl.cx>
netfilter: nf_conntrack_irc: Tighten matching on DCC message
Igor Ryzhov <iryzhov(a)nfware.com>
netfilter: nf_conntrack_sip: fix ct_sip_walk_headers
Chao Yu <chao.yu(a)oppo.com>
mm/slub: fix to return errno if kmalloc() fails
Kai Vehmanen <kai.vehmanen(a)linux.intel.com>
ALSA: hda: add Intel 5 Series / 3400 PCI DID
Mohan Kumar <mkumard(a)nvidia.com>
ALSA: hda/tegra: set depop delay for tegra
Alan Stern <stern(a)rowland.harvard.edu>
USB: core: Fix RST error in hub.c
Siddh Raman Pant <code(a)siddh.me>
wifi: mac80211: Fix UAF in ieee80211_scan_rx()
Hyunwoo Kim <imv4bel(a)gmail.com>
video: fbdev: pxa3xx-gcu: Fix integer overflow in pxa3xx_gcu_write
Stefan Metzmacher <metze(a)samba.org>
cifs: don't send down the destination address to sendmsg for a SOCK_STREAM
Yang Yingliang <yangyingliang(a)huawei.com>
parisc: ccio-dma: Add missing iounmap in error path in ccio_probe()
-------------
Diffstat:
Makefile | 4 ++--
arch/mips/lantiq/clk.c | 1 +
drivers/hv/vmbus_drv.c | 10 +++++++++-
drivers/net/can/usb/gs_usb.c | 4 ++--
drivers/net/ethernet/sun/sunhme.c | 4 ++--
drivers/net/ipvlan/ipvlan_core.c | 6 ++++--
drivers/net/team/team.c | 24 ++++++++++++++++++------
drivers/parisc/ccio-dma.c | 1 +
drivers/s390/block/dasd_alias.c | 9 +++++++--
drivers/tty/serial/serial-tegra.c | 5 ++---
drivers/usb/core/hub.c | 2 +-
drivers/video/fbdev/pxa3xx-gcu.c | 2 +-
fs/cifs/transport.c | 4 ++--
fs/ext4/ialloc.c | 2 +-
include/linux/serial_core.h | 17 +++++++++++++++++
mm/slub.c | 5 ++++-
net/mac80211/scan.c | 11 +++++++----
net/netfilter/nf_conntrack_irc.c | 34 ++++++++++++++++++++++++++++------
net/netfilter/nf_conntrack_sip.c | 4 ++--
sound/pci/hda/hda_intel.c | 2 ++
sound/pci/hda/patch_hdmi.c | 1 +
tools/perf/util/symbol-elf.c | 7 ++-----
22 files changed, 116 insertions(+), 43 deletions(-)
This is the start of the stable review cycle for the 4.14.295 release.
There are 38 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed, 28 Sep 2022 16:35:25 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.14.295-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.14.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.14.295-rc2
Dongliang Mu <mudongliangabcd(a)gmail.com>
media: em28xx: initialize refcount before kref_get
Jan Kara <jack(a)suse.cz>
ext4: make directory inode spreading reflect flexbg size
Vitaly Kuznetsov <vkuznets(a)redhat.com>
Drivers: hv: Never allocate anything besides framebuffer from framebuffer memory region
Stefan Haberland <sth(a)linux.ibm.com>
s390/dasd: fix Oops in dasd_alias_get_start_dev due to missing pavgroup
Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
serial: tegra: Use uart_xmit_advance(), fixes icount.tx accounting
Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
serial: Create uart_xmit_advance()
Sean Anderson <seanga2(a)gmail.com>
net: sunhme: Fix packet reception for len < RX_COPY_THRESHOLD
Adrian Hunter <adrian.hunter(a)intel.com>
perf kcore_copy: Do not check /proc/modules is unchanged
Marc Kleine-Budde <mkl(a)pengutronix.de>
can: gs_usb: gs_can_open(): fix race dev->can.state condition
Florian Westphal <fw(a)strlen.de>
netfilter: ebtables: fix memory leak when blob is malformed
Liang He <windhl(a)126.com>
of: mdio: Add of_node_put() when breaking out of for_each_xx
Randy Dunlap <rdunlap(a)infradead.org>
MIPS: lantiq: export clk_get_io() for lantiq_wdt.ko
Benjamin Poirier <bpoirier(a)nvidia.com>
net: team: Unsync device addresses on ndo_stop
Lu Wei <luwei32(a)huawei.com>
ipvlan: Fix out-of-bound bugs caused by unset skb->mac_header
Brett Creeley <brett.creeley(a)intel.com>
iavf: Fix cached head and tail value for iavf_get_tx_pending
David Leadbeater <dgl(a)dgl.cx>
netfilter: nf_conntrack_irc: Tighten matching on DCC message
Igor Ryzhov <iryzhov(a)nfware.com>
netfilter: nf_conntrack_sip: fix ct_sip_walk_headers
Fabio Estevam <festevam(a)denx.de>
arm64: dts: rockchip: Remove 'enable-active-low' from rk3399-puma
Chao Yu <chao.yu(a)oppo.com>
mm/slub: fix to return errno if kmalloc() fails
Kai Vehmanen <kai.vehmanen(a)linux.intel.com>
ALSA: hda: add Intel 5 Series / 3400 PCI DID
Mohan Kumar <mkumard(a)nvidia.com>
ALSA: hda/tegra: set depop delay for tegra
jerry meng <jerry-meng(a)foxmail.com>
USB: serial: option: add Quectel RM520N
Carl Yin(殷张成) <carl.yin(a)quectel.com>
USB: serial: option: add Quectel BG95 0x0203 composition
Alan Stern <stern(a)rowland.harvard.edu>
USB: core: Fix RST error in hub.c
Siddh Raman Pant <code(a)siddh.me>
wifi: mac80211: Fix UAF in ieee80211_scan_rx()
Takashi Iwai <tiwai(a)suse.de>
ALSA: hda/sigmatel: Fix unused variable warning for beep power change
Hyunwoo Kim <imv4bel(a)gmail.com>
video: fbdev: pxa3xx-gcu: Fix integer overflow in pxa3xx_gcu_write
Youling Tang <tangyouling(a)loongson.cn>
mksysmap: Fix the mismatch of 'L0' symbols in System.map
Alexander Sverdlin <alexander.sverdlin(a)nokia.com>
MIPS: OCTEON: irq: Fix octeon_irq_force_ciu_mapping()
jerry.meng <jerry-meng(a)foxmail.com>
net: usb: qmi_wwan: add Quectel RM520N
Takashi Iwai <tiwai(a)suse.de>
ALSA: hda/sigmatel: Keep power up while beep is enabled
Xiaolei Wang <xiaolei.wang(a)windriver.com>
regulator: pfuze100: Fix the global-out-of-bounds access in pfuze100_regulator_probe()
Takashi Iwai <tiwai(a)suse.de>
ASoC: nau8824: Fix semaphore unbalance at error paths
Stefan Metzmacher <metze(a)samba.org>
cifs: don't send down the destination address to sendmsg for a SOCK_STREAM
Yang Yingliang <yangyingliang(a)huawei.com>
parisc: ccio-dma: Add missing iounmap in error path in ccio_probe()
Stuart Menefy <stuart.menefy(a)mathembedded.com>
drm/meson: Correct OSD1 global alpha value
Pali Rohár <pali(a)kernel.org>
gpio: mpc8xxx: Fix support for IRQ_TYPE_LEVEL_LOW flow_type in mpc85xx
Sergey Shtylyov <s.shtylyov(a)omp.ru>
of: fdt: fix off-by-one error in unflatten_dt_nodes()
-------------
Diffstat:
Makefile | 4 ++--
arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi | 1 -
arch/mips/cavium-octeon/octeon-irq.c | 10 ++++++++
arch/mips/lantiq/clk.c | 1 +
drivers/gpio/gpio-mpc8xxx.c | 1 +
drivers/gpu/drm/meson/meson_plane.c | 2 +-
drivers/hv/vmbus_drv.c | 10 +++++++-
drivers/media/usb/em28xx/em28xx-cards.c | 4 ++--
drivers/net/can/usb/gs_usb.c | 4 ++--
drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 5 +++-
drivers/net/ethernet/sun/sunhme.c | 4 ++--
drivers/net/ipvlan/ipvlan_core.c | 6 +++--
drivers/net/team/team.c | 24 ++++++++++++++-----
drivers/net/usb/qmi_wwan.c | 1 +
drivers/of/fdt.c | 2 +-
drivers/of/of_mdio.c | 1 +
drivers/parisc/ccio-dma.c | 1 +
drivers/regulator/pfuze100-regulator.c | 2 +-
drivers/s390/block/dasd_alias.c | 9 +++++--
drivers/tty/serial/serial-tegra.c | 5 ++--
drivers/usb/core/hub.c | 2 +-
drivers/usb/serial/option.c | 6 +++++
drivers/video/fbdev/pxa3xx-gcu.c | 2 +-
fs/cifs/transport.c | 4 ++--
fs/ext4/ialloc.c | 2 +-
include/linux/serial_core.h | 17 ++++++++++++++
mm/slub.c | 5 +++-
net/bridge/netfilter/ebtables.c | 4 +++-
net/mac80211/scan.c | 11 +++++----
net/netfilter/nf_conntrack_irc.c | 34 ++++++++++++++++++++++-----
net/netfilter/nf_conntrack_sip.c | 4 ++--
scripts/mksysmap | 2 +-
sound/pci/hda/hda_intel.c | 2 ++
sound/pci/hda/patch_hdmi.c | 1 +
sound/pci/hda/patch_sigmatel.c | 24 +++++++++++++++++++
sound/soc/codecs/nau8824.c | 17 ++++++++------
tools/perf/util/symbol-elf.c | 7 ++----
37 files changed, 182 insertions(+), 59 deletions(-)
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
If a process is waiting on the ring buffer for data, there currently isn't
a clean way to force it to wake up. Add an ioctl call that will force any
tasks that are waiting on the trace_pipe_raw file to wake up.
Cc: stable(a)vger.kernel.org
Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/trace.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e101b0764b39..fdbcabfdd2c4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8349,12 +8349,30 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return ret;
}
+/* An ioctl call to the ring buffer file will wake up all waiters */
+static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ mutex_lock(&trace_types_lock);
+
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+ mutex_unlock(&trace_types_lock);
+ return 0;
+}
+
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
.poll = tracing_buffers_poll,
.release = tracing_buffers_release,
.splice_read = tracing_buffers_splice_read,
+ .unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
};
--
2.35.1
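Assuming the handler above (which ignores the command number and
argument, though that is not a documented ABI), a user space wake-up
is then as simple as:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	/* Any per-cpu trace_pipe_raw instance; the path assumes tracefs
	 * is mounted at /sys/kernel/tracing. */
	int fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw",
		      O_RDONLY);

	if (fd < 0)
		return 1;

	ioctl(fd, 0, 0);	/* wakes all waiters on this buffer */
	close(fd);
	return 0;
}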
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
When the file that represents the ring buffer is closed, there may be
waiters waiting for more input from the ring buffer. Call
ring_buffer_wake_waiters() to wake up any waiters when the file is
closed.
Cc: stable(a)vger.kernel.org
Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
include/linux/trace_events.h | 1 +
kernel/trace/trace.c | 15 +++++++++++++++
2 files changed, 16 insertions(+)
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8401dec93c15..20749bd9db71 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -92,6 +92,7 @@ struct trace_iterator {
unsigned int temp_size;
char *fmt; /* modified format holder */
unsigned int fmt_size;
+ long wait_index;
/* trace_seq for __print_flags() and __print_symbolic() etc. */
struct trace_seq tmp_seq;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aed7ea6e6045..e101b0764b39 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8160,6 +8160,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
__trace_array_put(iter->tr);
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
@@ -8313,6 +8319,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
+ long wait_index;
+
if (ret)
goto out;
@@ -8320,10 +8328,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
+ wait_index = READ_ONCE(iter->wait_index);
+
ret = wait_on_pipe(iter, iter->tr->buffer_percent);
if (ret)
goto out;
+ /* Make sure we see the new wait_index */
+ smp_rmb();
+ if (wait_index != iter->wait_index)
+ goto out;
+
goto again;
}
--
2.35.1
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
On closing of a file that represents a ring buffer, or on flushing the
file, there may be waiters on the ring buffer that need to be woken up
so they can exit the ring_buffer_wait() function.
Add ring_buffer_wake_waiters() to wake up the waiters on the ring buffer
and allow them to exit the wait loop.
Cc: stable(a)vger.kernel.org
Fixes: 15693458c4bc0 ("tracing/ring-buffer: Move poll wake ups into ring buffer code")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
include/linux/ring_buffer.h | 2 +-
kernel/trace/ring_buffer.c | 34 ++++++++++++++++++++++++++++++++++
2 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index dac53fd3afea..2504df9a0453 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -101,7 +101,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full);
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table);
-
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu);
#define RING_BUFFER_ALL_CPUS -1
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5a7d818ca3ea..677812b8fae0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -413,6 +413,7 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
+ long wait_index;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -924,6 +925,32 @@ static void rb_wake_up_waiters(struct irq_work *work)
}
}
+/**
+ * ring_buffer_wake_waiters - wake up any waiters on this ring buffer
+ * @buffer: The ring buffer to wake waiters on
+ *
+ * In the case of a file that represents a ring buffer is closing,
+ * it is prudent to wake up any waiters that are on this.
+ */
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct rb_irq_work *rbwork;
+
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ rbwork = &buffer->irq_work;
+ } else {
+ cpu_buffer = buffer->buffers[cpu];
+ rbwork = &cpu_buffer->irq_work;
+ }
+
+ rbwork->wait_index++;
+ /* make sure the waiters see the new index */
+ smp_wmb();
+
+ rb_wake_up_waiters(&rbwork->work);
+}
+
/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
@@ -939,6 +966,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
+ long wait_index;
int ret = 0;
/*
@@ -957,6 +985,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
work = &cpu_buffer->irq_work;
}
+ wait_index = READ_ONCE(work->wait_index);
while (true) {
if (full)
@@ -1021,6 +1050,11 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
}
schedule();
+
+ /* Make sure to see the new wait index */
+ smp_rmb();
+ if (wait_index != work->wait_index)
+ break;
}
if (full)
--
2.35.1
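Stripped of the ring-buffer specifics, the wait_index handshake these
patches introduce is a generic teardown pattern; a userspace C11 sketch
with release/acquire atomics standing in for the kernel's
smp_wmb()/smp_rmb() (illustrative names, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_long wait_index;

/* Waker side: bump the index before issuing the wakeup, so a waiter
 * racing with the wakeup still observes the change. */
static void wake_waiters(void)
{
	atomic_fetch_add_explicit(&wait_index, 1, memory_order_release);
	/* ... wake_up_all() equivalent would go here ... */
}

/* Waiter side: snapshot the index before sleeping; after waking,
 * leave the retry loop if the index moved, instead of going back to
 * sleep on a buffer that is being torn down. */
static bool should_stop_waiting(long snapshot)
{
	return atomic_load_explicit(&wait_index, memory_order_acquire)
	       != snapshot;
}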
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
The wake-up-waiters logic only checks the "wakeup_full" variable and not the
"full_waiters_pending". The full_waiters_pending is set when a waiter is
added to the wait queue. The wakeup_full is only set when an event is
triggered, and it clears the full_waiters_pending to avoid multiple calls
to irq_work_queue().
The irq_work callback really needs to check both wakeup_full and
full_waiters_pending, so that this code can also be used to wake up
waiters when a file that represents the ring buffer is closed.
Cc: stable(a)vger.kernel.org
Fixes: 15693458c4bc0 ("tracing/ring-buffer: Move poll wake ups into ring buffer code")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/ring_buffer.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 02db92c9eb1b..5a7d818ca3ea 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -917,8 +917,9 @@ static void rb_wake_up_waiters(struct irq_work *work)
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
wake_up_all(&rbwork->waiters);
- if (rbwork->wakeup_full) {
+ if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
rbwork->wakeup_full = false;
+ rbwork->full_waiters_pending = false;
wake_up_all(&rbwork->full_waiters);
}
}
--
2.35.1
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
The logic that decides when the shortest waiters on the ring buffer
should be woken up uses a less-than instead of a greater-than compare,
which causes shortest_full to actually track the longest.
Cc: stable(a)vger.kernel.org
Fixes: 2c2b0a78b3739 ("ring-buffer: Add percentage of ring buffer full to wake up reader")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/ring_buffer.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6b145d48dfd1..02db92c9eb1b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1011,7 +1011,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
nr_pages = cpu_buffer->nr_pages;
dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full < full)
+ cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
if (!pagebusy &&
--
2.35.1
From: Menglong Dong <imagedong(a)tencent.com>
The mptcp socket and its subflow sockets in the accept queue can't be
released after the process exits.
When an mptcp socket in the listening state is released, the
corresponding tcp socket is released too, along with the tcp sockets
in the unaccept queue. However, only the init subflow is in the
unaccept queue; the joined subflows are not, which means the joined
subflows won't be released, and therefore the corresponding unaccepted
mptcp socket will not be released either.
This can be reproduced easily with the following steps:
1. create 2 namespace and veth:
$ ip netns add mptcp-client
$ ip netns add mptcp-server
$ sysctl -w net.ipv4.conf.all.rp_filter=0
$ ip netns exec mptcp-client sysctl -w net.mptcp.enabled=1
$ ip netns exec mptcp-server sysctl -w net.mptcp.enabled=1
$ ip link add red-client netns mptcp-client type veth peer red-server \
netns mptcp-server
$ ip -n mptcp-server address add 10.0.0.1/24 dev red-server
$ ip -n mptcp-server address add 192.168.0.1/24 dev red-server
$ ip -n mptcp-client address add 10.0.0.2/24 dev red-client
$ ip -n mptcp-client address add 192.168.0.2/24 dev red-client
$ ip -n mptcp-server link set red-server up
$ ip -n mptcp-client link set red-client up
2. configure the endpoint and limit for client and server:
$ ip -n mptcp-server mptcp endpoint flush
$ ip -n mptcp-server mptcp limits set subflow 2 add_addr_accepted 2
$ ip -n mptcp-client mptcp endpoint flush
$ ip -n mptcp-client mptcp limits set subflow 2 add_addr_accepted 2
$ ip -n mptcp-client mptcp endpoint add 192.168.0.2 dev red-client id \
1 subflow
3. listen and accept on a port, such as 9999. The nc command we used
here is modified to use the mptcp protocol by default.
$ ip netns exec mptcp-server nc -l -k -p 9999
4. open another *two* terminal and use each of them to connect to the
server with the following command:
$ ip netns exec mptcp-client nc 10.0.0.1 9999
Input something after connecting to trigger the connection of the
second subflow, so that there are two established mptcp connections,
with the second one still unaccepted.
5. exit all the nc commands, and check the tcp sockets in the server
namespace. You will find one tcp socket stuck in the CLOSE_WAIT state
that is never released.
Fix this by closing all of the unaccepted mptcp sockets in
mptcp_subflow_queue_clean() with __mptcp_close().
Now, we can ensure that all unaccepted mptcp sockets will be cleaned by
__mptcp_close() before they are released, so mptcp_sock_destruct(), which
is used to clean the unaccepted mptcp socket, is not needed anymore.
The mptcp selftests were run for this commit, with no new failures.
Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests")
Fixes: 6aeed9045071 ("mptcp: fix race on unaccepted mptcp sockets")
Cc: stable(a)vger.kernel.org
Reviewed-by: Jiang Biao <benbjiang(a)tencent.com>
Reviewed-by: Mengen Sun <mengensun(a)tencent.com>
Acked-by: Paolo Abeni <pabeni(a)redhat.com>
Signed-off-by: Menglong Dong <imagedong(a)tencent.com>
Signed-off-by: Mat Martineau <mathew.j.martineau(a)linux.intel.com>
---
net/mptcp/protocol.c | 2 +-
net/mptcp/protocol.h | 1 +
net/mptcp/subflow.c | 33 +++++++--------------------------
3 files changed, 9 insertions(+), 27 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index f7690414320a..f8897a70c11d 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2662,7 +2662,7 @@ static void __mptcp_clear_xmit(struct sock *sk)
dfrag_clear(sk, dfrag);
}
-static void mptcp_cancel_work(struct sock *sk)
+void mptcp_cancel_work(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 8f123d450c76..8f372b8f059c 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -613,6 +613,7 @@ void mptcp_subflow_queue_clean(struct sock *ssk);
void mptcp_sock_graft(struct sock *sk, struct socket *parent);
struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
bool __mptcp_close(struct sock *sk, long timeout);
+void mptcp_cancel_work(struct sock *sk);
bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
const struct mptcp_addr_info *b, bool use_port);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index c7d49fb6e7bd..07dd23d0fe04 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -602,30 +602,6 @@ static bool subflow_hmac_valid(const struct request_sock *req,
return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}
-static void mptcp_sock_destruct(struct sock *sk)
-{
- /* if new mptcp socket isn't accepted, it is free'd
- * from the tcp listener sockets request queue, linked
- * from req->sk. The tcp socket is released.
- * This calls the ULP release function which will
- * also remove the mptcp socket, via
- * sock_put(ctx->conn).
- *
- * Problem is that the mptcp socket will be in
- * ESTABLISHED state and will not have the SOCK_DEAD flag.
- * Both result in warnings from inet_sock_destruct.
- */
- if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
- sk->sk_state = TCP_CLOSE;
- WARN_ON_ONCE(sk->sk_socket);
- sock_orphan(sk);
- }
-
- /* We don't need to clear msk->subflow, as it's still NULL at this point */
- mptcp_destroy_common(mptcp_sk(sk), 0);
- inet_sock_destruct(sk);
-}
-
static void mptcp_force_close(struct sock *sk)
{
/* the msk is not yet exposed to user-space */
@@ -768,7 +744,6 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
/* new mpc subflow takes ownership of the newly
* created mptcp socket
*/
- new_msk->sk_destruct = mptcp_sock_destruct;
mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq;
mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1);
mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
@@ -1763,13 +1738,19 @@ void mptcp_subflow_queue_clean(struct sock *listener_ssk)
for (msk = head; msk; msk = next) {
struct sock *sk = (struct sock *)msk;
- bool slow;
+ bool slow, do_cancel_work;
+ sock_hold(sk);
slow = lock_sock_fast_nested(sk);
next = msk->dl_next;
msk->first = NULL;
msk->dl_next = NULL;
+
+ do_cancel_work = __mptcp_close(sk, 0);
unlock_sock_fast(sk, slow);
+ if (do_cancel_work)
+ mptcp_cancel_work(sk);
+ sock_put(sk);
}
/* we are still under the listener msk socket lock */
--
2.37.3
Greetings to you, my dear friend,
This is already the second time I am trying to contact you. I hope this message finds you in good health.
From the desk of: Barrister Paul Martin, Esq., California, USA. I am writing to ask for your help in standing as next of kin and final beneficiary. This money, amounting to over ($8.5 million), belongs to my late client, who died several years ago, and the bank has instructed me to present a beneficiary, or the money will go into the bank's treasury as an
unclaimed fund.
If you are interested in helping me, we will share it 40% for me, 40% for you, 10% for an orphanage and 10% for any expenses that arise during the transfer. To confirm this message and indicate your interest, reply to my personal email below for more details, with your:
(I) Your full name............ ,
(II) Your country .................,
(III) Age ......................,
I hope to read more details from you soon, and I hope this message reaches you in good health.
Sincerely yours,
Barrister Paul Martin, Esq.
(Email: esq.paulmartins(a)gmail.com)
Hello Dear God's Select, Good Day,
I apologize if this mail finds you at a bad moment. It might not be the
best way to approach you, as we have not met before, but due to the
urgency of my present situation I decided to communicate this way, so
please pardon my manner. I am writing this mail to you with heavy tears
in my eyes and great sorrow in my heart. My name is Mrs. Juliette
Morgan, and I am contacting you from my country, Norway. I want to tell
you this because I don't have any other option than to tell you, as I
was touched to open up to you.
I was married to Mr. Sami Morgan, who worked with the Norway embassy in
Burkina Faso for nine years before he died in the year 2020. We were
married for eleven years without a child. He died after a brief illness
that lasted for only five days. Since his death I decided not to
remarry. When my late husband was alive he deposited the sum of €8.5
million (eight million, five hundred thousand euros) in a bank in
Ouagadougou, the capital city of Burkina Faso in West Africa. Presently
this money is still in the bank. He made this money available for the
exportation of gold from Burkina Faso mining.
Recently, my doctor told me that I would not last beyond a period of
seven months due to a cancer problem. The one that disturbs me most is
my stroke sickness. Having known my condition, I decided to hand this
money over to you to take care of the less privileged people; you will
utilize this money the way I am going to instruct herein.
I want you to take 30 percent of the total money for your personal use,
while 70% of the money will go to charity, people in the street and
helping the orphanage. I grew up as an orphan and I don't have anybody
as a family member, just to ensure that the house of God is maintained.
I am doing this so that God will forgive my sins and accept my soul,
because these sicknesses have made me suffer so much.
As soon as I receive your reply I shall give you the contact of the
bank in Burkina Faso, and I will also instruct the bank manager to
issue you an authority letter that will prove you are the present
beneficiary of the money in the bank, that is, if you assure me that
you will act accordingly as I stated herein.
Always reply to my alternative address for security purposes.
Hoping to receive your reply:
From Mrs. Juliette Morgan,
From: Zheng Yejian <zhengyejian1(a)huawei.com>
When executing the following commands as the documentation describes,
the log "#### all functions enabled ####" was not shown as expected:
1. Set a 'mod' filter:
$ echo 'write*:mod:ext3' > /sys/kernel/tracing/set_ftrace_filter
2. Invert above filter:
$ echo '!write*:mod:ext3' >> /sys/kernel/tracing/set_ftrace_filter
3. Read the file:
$ cat /sys/kernel/tracing/set_ftrace_filter
Through some debugging, I found that the flag FTRACE_HASH_FL_MOD was
not cleared after an inversion like step 2 above, so the result of
ftrace_hash_empty() was incorrect.
Link: https://lkml.kernel.org/r/20220926152008.2239274-1-zhengyejian1@huawei.com
Cc: <mingo(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: 8c08f0d5c6fb ("ftrace: Have cached module filters be an active filter")
Signed-off-by: Zheng Yejian <zhengyejian1(a)huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/ftrace.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 439e2ab6905e..5a1ec7e1af33 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6081,8 +6081,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
if (filter_hash) {
orig_hash = &iter->ops->func_hash->filter_hash;
- if (iter->tr && !list_empty(&iter->tr->mod_trace))
- iter->hash->flags |= FTRACE_HASH_FL_MOD;
+ if (iter->tr) {
+ if (list_empty(&iter->tr->mod_trace))
+ iter->hash->flags &= ~FTRACE_HASH_FL_MOD;
+ else
+ iter->hash->flags |= FTRACE_HASH_FL_MOD;
+ }
} else
orig_hash = &iter->ops->func_hash->notrace_hash;
--
2.35.1
From: Tao Chen <chentao.kernel(a)linux.alibaba.com>
Allocating the event dir will fail when the event name is not set,
e.g. when using the command:
"echo "e:esys/ syscalls/sys_enter_openat file=\$filename:string"
>> dynamic_events"
It seems that the dir name "syscalls/sys_enter_openat" is not allowed
in debugfs, so just use "sys_enter_openat" as the event name.
Link: https://lkml.kernel.org/r/1664028814-45923-1-git-send-email-chentao.kernel@…
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Tom Zanussi <zanussi(a)kernel.org>
Cc: Linyu Yuan <quic_linyyuan(a)quicinc.com>
Cc: Tao Chen <chentao.kernel(a)linux.alibaba.com>
Cc: stable(a)vger.kernel.org
Fixes: 95c104c378dc ("tracing: Auto generate event name when creating a group of events")
Acked-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Signed-off-by: Tao Chen <chentao.kernel(a)linux.alibaba.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/trace_eprobe.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 78299d3724a2..c08bde9871ec 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -1039,8 +1039,7 @@ static int __trace_eprobe_create(int argc, const char *argv[])
}
if (!event) {
- strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN);
- sanitize_event_name(buf1);
+ strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN);
event = buf1;
}
--
2.35.1
This reverts commit 8bd6b8c4b1009d7d2662138d6bdc6fe58a9274c5.
Prerequisite revert for the reverting of the original commit 0f0101719138.
Fixes: 8bd6b8c4b100 ("USB: fixup for merge issue with "usb: dwc3: Don't switch OTG -> peripheral if extcon is present"")
Fixes: 0f0101719138 ("usb: dwc3: Don't switch OTG -> peripheral if extcon is present")
Reported-by: Ferry Toth <fntoth(a)gmail.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Tested-by: Ferry Toth <fntoth(a)gmail.com> # for Merrifield
---
drivers/usb/dwc3/core.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index d0237b30c9be..c2b463469d51 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -1684,8 +1684,13 @@ static struct extcon_dev *dwc3_get_extcon(struct dwc3 *dwc)
* This device property is for kernel internal use only and
* is expected to be set by the glue code.
*/
- if (device_property_read_string(dev, "linux,extcon-name", &name) == 0)
- return extcon_get_extcon_dev(name);
+ if (device_property_read_string(dev, "linux,extcon-name", &name) == 0) {
+ edev = extcon_get_extcon_dev(name);
+ if (!edev)
+ return ERR_PTR(-EPROBE_DEFER);
+
+ return edev;
+ }
/*
* Try to get an extcon device from the USB PHY controller's "port"
--
2.35.1
Both i.MX6 and i.MX8 reference manuals list 0xBF8 as SNVS_HPVIDR1
(chapters 57.9 and 6.4.5 respectively).
Without this, trying to read the revision number results in 0 on
all revisions, causing the i.MX6 quirk to apply on all platforms,
which in turn causes the driver to synthesise power button release
events instead of passing the real one as they happen even on
platforms like i.MX8 where that's not wanted.
Fixes: 1a26c920717a ("Input: snvs_pwrkey - send key events for i.MX6 S, DL and Q")
Cc: <stable(a)vger.kernel.org>
Tested-by: Martin Kepplinger <martin.kepplinger(a)puri.sm>
Signed-off-by: Sebastian Krzyszkowiak <sebastian.krzyszkowiak(a)puri.sm>
---
Resent <20220321171755.656750-1-sebastian.krzyszkowiak(a)puri.sm>
v2: augmented commit message; added cc: stable and tested-by
---
drivers/input/keyboard/snvs_pwrkey.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/input/keyboard/snvs_pwrkey.c b/drivers/input/keyboard/snvs_pwrkey.c
index 65286762b02a..ad8660be0127 100644
--- a/drivers/input/keyboard/snvs_pwrkey.c
+++ b/drivers/input/keyboard/snvs_pwrkey.c
@@ -20,7 +20,7 @@
#include <linux/mfd/syscon.h>
#include <linux/regmap.h>
-#define SNVS_HPVIDR1_REG 0xF8
+#define SNVS_HPVIDR1_REG 0xBF8
#define SNVS_LPSR_REG 0x4C /* LP Status Register */
#define SNVS_LPCR_REG 0x38 /* LP Control Register */
#define SNVS_HPSR_REG 0x14
--
2.35.1
Deferred probe is an expected return value for fwnode_usb_role_switch_get().
Given that the driver deals with it properly, there's no need to output a
warning that may potentially confuse users.
Fixes: 3c162511530c ("usb: typec: ucsi: Wait for the USB role switches")
Cc: stable(a)vger.kernel.org
Signed-off-by: Wayne Chang <waynec(a)nvidia.com>
---
V1 -> V2: adjust the coding style for better reading format.
drivers/usb/typec/ucsi/ucsi.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c
index 7f2624f42724..e961ebecd7df 100644
--- a/drivers/usb/typec/ucsi/ucsi.c
+++ b/drivers/usb/typec/ucsi/ucsi.c
@@ -1069,11 +1069,9 @@ static int ucsi_register_port(struct ucsi *ucsi, int index)
cap->fwnode = ucsi_find_fwnode(con);
con->usb_role_sw = fwnode_usb_role_switch_get(cap->fwnode);
- if (IS_ERR(con->usb_role_sw)) {
- dev_err(ucsi->dev, "con%d: failed to get usb role switch\n",
- con->num);
- return PTR_ERR(con->usb_role_sw);
- }
+ if (IS_ERR(con->usb_role_sw))
+ return dev_err_probe(ucsi->dev, PTR_ERR(con->usb_role_sw),
+ "con%d: failed to get usb role switch\n", con->num);
/* Delay other interactions with the con until registration is complete */
mutex_lock(&con->lock);
--
2.25.1
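The gain from dev_err_probe() is that it handles -EPROBE_DEFER
specially: it returns the error unchanged, but logs it at debug rather
than error level (and records the deferral reason for the
devices_deferred debugfs file). A simplified open-coded equivalent of
what it does:

        static inline int my_err_probe(struct device *dev, int err, const char *msg)
        {
                if (err != -EPROBE_DEFER)
                        dev_err(dev, "error %d: %s\n", err, msg);
                else
                        dev_dbg(dev, "deferred: %s\n", msg); /* real helper also
                                                              * records the reason */
                return err;
        }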
Deferred probe is an expected return value for fwnode_usb_role_switch_get().
Given that the driver deals with it properly, there's no need to output
a warning that may confuse users.
Fixes: 3c162511530c ("usb: typec: ucsi: Wait for the USB role switches")
Cc: stable(a)vger.kernel.org
Signed-off-by: Wayne Chang <waynec(a)nvidia.com>
---
drivers/usb/typec/ucsi/ucsi.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c
index 7f2624f42724..45a4374e9baa 100644
--- a/drivers/usb/typec/ucsi/ucsi.c
+++ b/drivers/usb/typec/ucsi/ucsi.c
@@ -1069,11 +1069,10 @@ static int ucsi_register_port(struct ucsi *ucsi, int index)
cap->fwnode = ucsi_find_fwnode(con);
con->usb_role_sw = fwnode_usb_role_switch_get(cap->fwnode);
- if (IS_ERR(con->usb_role_sw)) {
- dev_err(ucsi->dev, "con%d: failed to get usb role switch\n",
+ if (IS_ERR(con->usb_role_sw))
+ return dev_err_probe(ucsi->dev, PTR_ERR(con->usb_role_sw),
+ "con%d: failed to get usb role switch\n",
con->num);
- return PTR_ERR(con->usb_role_sw);
- }
/* Delay other interactions with the con until registration is complete */
mutex_lock(&con->lock);
--
2.25.1
This is a note to let you know that I've just added the patch titled
usb: typec: ucsi: Remove incorrect warning
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 415ba26cb73f7d22a892043301b91b57ae54db02 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
Date: Thu, 22 Sep 2022 17:59:24 +0300
Subject: usb: typec: ucsi: Remove incorrect warning
Sink only devices do not have any source capabilities, so
the driver should not warn about that. Also DRP (Dual Role
Power) capable devices, such as USB Type-C docking stations,
do not return any source capabilities unless they are
plugged into a power supply themselves.
Fixes: 1f4642b72be7 ("usb: typec: ucsi: Retrieve all the PDOs instead of just the first 4")
Reported-by: Paul Menzel <pmenzel(a)molgen.mpg.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
Link: https://lore.kernel.org/r/20220922145924.80667-1-heikki.krogerus@linux.inte…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/typec/ucsi/ucsi.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c
index 7f2624f42724..6364f0d467ea 100644
--- a/drivers/usb/typec/ucsi/ucsi.c
+++ b/drivers/usb/typec/ucsi/ucsi.c
@@ -588,8 +588,6 @@ static int ucsi_get_pdos(struct ucsi_connector *con, int is_partner,
num_pdos * sizeof(u32));
if (ret < 0 && ret != -ETIMEDOUT)
dev_err(ucsi->dev, "UCSI_GET_PDOS failed (%d)\n", ret);
- if (ret == 0 && offset == 0)
- dev_warn(ucsi->dev, "UCSI_GET_PDOS returned 0 bytes\n");
return ret;
}
--
2.37.3
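The warning was wrong because an empty PDO set is a legitimate answer,
not a protocol failure. A schematic of the caller-side distinction
(wrapper name hypothetical):

        int n;

        n = read_source_pdos(con, pdos); /* hypothetical wrapper around UCSI_GET_PDOS */
        if (n < 0)
                return n; /* a real command failure */
        if (n == 0)
                return 0; /* sink-only partner, or an unpowered DRP dock:
                           * nothing to report, nothing to warn about */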
The full RNG initialization relies on some timestamps, made possible
with general functions like time_init() and timekeeping_init(). However,
these are only available rather late in initialization. Meanwhile, other
things, such as memory allocator functions, make use of the RNG much
earlier.
So split RNG initialization into two phases. We can give arch randomness
very early on, and then later, after timekeeping and such are available,
initialize the rest.
This ensures that, for example, slabs are properly randomized if RDRAND
is available. Without this, CONFIG_SLAB_FREELIST_RANDOM=y loses a degree
of its security, because its random seed is potentially deterministic,
since it hasn't yet incorporated RDRAND. It also makes it possible to
use a better seed in kfence, which currently relies on only the cycle
counter.
Another positive consequence is that on systems with RDRAND, running
with CONFIG_WARN_ALL_UNSEEDED_RANDOM=y results in no warnings at all.
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason(a)zx2c4.com>
---
drivers/char/random.c | 47 ++++++++++++++++++++++++------------------
include/linux/random.h | 3 ++-
init/main.c | 17 +++++++--------
3 files changed, 37 insertions(+), 30 deletions(-)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index a90d96f4b3bb..1cb53495e8f7 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -772,18 +772,13 @@ static int random_pm_notification(struct notifier_block *nb, unsigned long actio
static struct notifier_block pm_notifier = { .notifier_call = random_pm_notification };
/*
- * The first collection of entropy occurs at system boot while interrupts
- * are still turned off. Here we push in latent entropy, RDSEED, a timestamp,
- * utsname(), and the command line. Depending on the above configuration knob,
- * RDSEED may be considered sufficient for initialization. Note that much
- * earlier setup may already have pushed entropy into the input pool by the
- * time we get here.
+ * This is called extremely early, before time keeping functionality is
+ * available, but arch randomness is. Interrupts are not yet enabled.
*/
-int __init random_init(const char *command_line)
+void __init random_init_early(const char *command_line)
{
- ktime_t now = ktime_get_real();
- size_t i, longs, arch_bits;
unsigned long entropy[BLAKE2S_BLOCK_SIZE / sizeof(long)];
+ size_t i, longs, arch_bits;
#if defined(LATENT_ENTROPY_PLUGIN)
static const u8 compiletime_seed[BLAKE2S_BLOCK_SIZE] __initconst __latent_entropy;
@@ -803,34 +798,46 @@ int __init random_init(const char *command_line)
i += longs;
continue;
}
- entropy[0] = random_get_entropy();
- _mix_pool_bytes(entropy, sizeof(*entropy));
arch_bits -= sizeof(*entropy) * 8;
++i;
}
- _mix_pool_bytes(&now, sizeof(now));
- _mix_pool_bytes(utsname(), sizeof(*(utsname())));
+
_mix_pool_bytes(command_line, strlen(command_line));
+
+ if (trust_cpu)
+ credit_init_bits(arch_bits);
+}
+
+/*
+ * This is called a little bit after the prior function, and now there is
+ * access to timestamps counters. Interrupts are not yet enabled.
+ */
+void __init random_init(void)
+{
+ unsigned long entropy = random_get_entropy();
+ ktime_t now = ktime_get_real();
+
+ _mix_pool_bytes(utsname(), sizeof(*(utsname())));
+ _mix_pool_bytes(&now, sizeof(now));
+ _mix_pool_bytes(&entropy, sizeof(entropy));
add_latent_entropy();
/*
- * If we were initialized by the bootloader before jump labels are
- * initialized, then we should enable the static branch here, where
+ * If we were initialized by the cpu or bootloader before jump labels
+ * are initialized, then we should enable the static branch here, where
* it's guaranteed that jump labels have been initialized.
*/
if (!static_branch_likely(&crng_is_ready) && crng_init >= CRNG_READY)
crng_set_ready(NULL);
+ /* Reseed if already seeded by earlier phases. */
if (crng_ready())
crng_reseed();
- else if (trust_cpu)
- _credit_init_bits(arch_bits);
WARN_ON(register_pm_notifier(&pm_notifier));
- WARN(!random_get_entropy(), "Missing cycle counter and fallback timer; RNG "
- "entropy collection will consequently suffer.");
- return 0;
+ WARN(!entropy, "Missing cycle counter and fallback timer; RNG "
+ "entropy collection will consequently suffer.");
}
/*
diff --git a/include/linux/random.h b/include/linux/random.h
index 3fec206487f6..a9e6e16f9774 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -72,7 +72,8 @@ static inline unsigned long get_random_canary(void)
return get_random_long() & CANARY_MASK;
}
-int __init random_init(const char *command_line);
+void __init random_init_early(const char *command_line);
+void __init random_init(void);
bool rng_is_initialized(void);
int wait_for_random_bytes(void);
diff --git a/init/main.c b/init/main.c
index 1fe7942f5d4a..0866e5d0d467 100644
--- a/init/main.c
+++ b/init/main.c
@@ -976,6 +976,9 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
parse_args("Setting extra init args", extra_init_args,
NULL, 0, -1, -1, NULL, set_init_arg);
+ /* Architectural and non-timekeeping rng init, before allocator init */
+ random_init_early(command_line);
+
/*
* These use large bootmem allocations and must precede
* kmem_cache_init()
@@ -1035,17 +1038,13 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
hrtimers_init();
softirq_init();
timekeeping_init();
- kfence_init();
time_init();
- /*
- * For best initial stack canary entropy, prepare it after:
- * - setup_arch() for any UEFI RNG entropy and boot cmdline access
- * - timekeeping_init() for ktime entropy used in random_init()
- * - time_init() for making random_get_entropy() work on some platforms
- * - random_init() to initialize the RNG from from early entropy sources
- */
- random_init(command_line);
+ /* This must be after timekeeping is initialized */
+ random_init();
+
+ /* These make use of the fully initialized rng */
+ kfence_init();
boot_init_stack_canary();
perf_event_init();
--
2.37.3
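Condensing the init/main.c hunks above, the resulting boot sequence
looks like this (schematic, not literal code):

        start_kernel()
                random_init_early(command_line); /* arch entropy, e.g. RDRAND;
                                                  * runs before the allocators */
                mm_init();                       /* slab freelist randomization
                                                  * now draws from a seeded pool */
                timekeeping_init();
                time_init();
                random_init();                   /* mix in ktime + cycle counter,
                                                  * reseed the crng */
                kfence_init();                   /* uses the fully initialized rng */
                boot_init_stack_canary();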
This is a note to let you know that I've just added the patch titled
usb: typec: ucsi: Remove incorrect warning
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-testing branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will be merged to the usb-next branch sometime soon,
after it passes testing, and the merge window is open.
If you have any questions about this process, please let me know.
From dd3de5ad032db61f3edd9c94daf27e52f8dc25ca Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
Date: Thu, 22 Sep 2022 17:59:24 +0300
Subject: usb: typec: ucsi: Remove incorrect warning
Sink only devices do not have any source capabilities, so
the driver should not warn about that. Also DRP (Dual Role
Power) capable devices, such as USB Type-C docking stations,
do not return any source capabilities unless they are
plugged into a power supply themselves.
Fixes: 1f4642b72be7 ("usb: typec: ucsi: Retrieve all the PDOs instead of just the first 4")
Reported-by: Paul Menzel <pmenzel(a)molgen.mpg.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
Link: https://lore.kernel.org/r/20220922145924.80667-1-heikki.krogerus@linux.inte…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/typec/ucsi/ucsi.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c
index 7f2624f42724..6364f0d467ea 100644
--- a/drivers/usb/typec/ucsi/ucsi.c
+++ b/drivers/usb/typec/ucsi/ucsi.c
@@ -588,8 +588,6 @@ static int ucsi_get_pdos(struct ucsi_connector *con, int is_partner,
num_pdos * sizeof(u32));
if (ret < 0 && ret != -ETIMEDOUT)
dev_err(ucsi->dev, "UCSI_GET_PDOS failed (%d)\n", ret);
- if (ret == 0 && offset == 0)
- dev_warn(ucsi->dev, "UCSI_GET_PDOS returned 0 bytes\n");
return ret;
}
--
2.37.3
Hello,
this is a backport of commit 0d519cadf751
("arm64: kexec_file: use more system keyrings to verify kernel image signature")
to the stable 5.15 tree, including the preparatory patches.
Some patches needed minor adjustments for context.
Thanks
Michal
Coiby Xu (3):
kexec: clean up arch_kexec_kernel_verify_sig
kexec, KEYS: make the code in bzImage64_verify_sig generic
arm64: kexec_file: use more system keyrings to verify kernel image
signature
Naveen N. Rao (2):
kexec_file: drop weak attribute from functions
kexec: drop weak attribute from functions
Sven Schnelle (1):
s390/kexec_file: move kernel image size check
arch/arm64/include/asm/kexec.h | 20 ++++++-
arch/arm64/kernel/kexec_image.c | 11 +---
arch/powerpc/include/asm/kexec.h | 14 +++++
arch/s390/boot/head.S | 2 -
arch/s390/include/asm/kexec.h | 14 +++++
arch/s390/include/asm/setup.h | 1 -
arch/s390/kernel/machine_kexec_file.c | 17 +-----
arch/x86/include/asm/kexec.h | 12 ++++
arch/x86/kernel/kexec-bzimage64.c | 20 +------
include/linux/kexec.h | 82 ++++++++++++++++++++++----
kernel/kexec_core.c | 27 ---------
kernel/kexec_file.c | 83 ++++++++++-----------------
12 files changed, 163 insertions(+), 140 deletions(-)
--
2.35.3
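The two "drop weak attribute" patches in the series share one pattern:
instead of a __weak default in common code that an architecture
silently overrides at link time, the default becomes a static inline in
the shared header and each architecture opts in explicitly via the
preprocessor. The general shape, with a hypothetical function name:

        /* arch/<arch>/include/asm/kexec.h: the arch provides its own version */
        int arch_do_thing(struct kimage *image);
        #define arch_do_thing arch_do_thing

        /* include/linux/kexec.h: fallback when no arch override is defined */
        #ifndef arch_do_thing
        static inline int arch_do_thing(struct kimage *image)
        {
                return 0;
        }
        #endif

The #define lets the preprocessor see the override, so the selection
happens at compile time rather than relying on weak symbol resolution.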
On Mon, Sep 26, 2022, at 9:19 PM, Sasha Levin wrote:
>>
>>Ah, so you mean this just got incorrectly identified as a dependency
>>by a script, not that you were actually considering the backport
>>of the context changes, right?
>
> My workflow is to have the script attempt to identify and cherry-pick
> all dependencies, marking them as such, and then I manually go over the
> result and decide whether we really need the patches it picked or not.
Ok, got it.
> So in theory yes, at some point this was considered for backport, but
> should have been dropped during the review phase.
It looks like it's still in stable-rc/linux-5.10.y, can we drop it now?
Arnd