commit 7cf209ba8a86410939a24cb1aeb279479a7e0ca6 upstream.
Patch series "mm/memory_hotplug: preparatory patches for new online policy and memory"
These are all cleanups and one fix previously sent as part of [1]:
[PATCH v1 00/12] mm/memory_hotplug: "auto-movable" online policy and memory
groups.
These patches make sense even without the other series, therefore I pulled
them out to make the other series easier to digest.
[1] https://lkml.kernel.org/r/20210607195430.48228-1-david@redhat.com
This patch (of 4):
Checkpatch complained on a follow-up patch that we are using "unsigned"
here, which defaults to "unsigned int" and checkpatch is correct.
As we will search for a fitting zone using the wrong pfn, we might end
up onlining memory to one of the special kernel zones, such as ZONE_DMA,
which can end badly as the onlined memory does not satisfy properties of
these zones.
Use "unsigned long" instead, just as we do in other places when handling
PFNs. This can bite us once we have physical addresses in the range of
multiple TB.
Link: https://lkml.kernel.org/r/20210712124052.26491-2-david@redhat.com
Fixes: e5e689302633 ("mm, memory_hotplug: display allowed zones in the preferred ordering")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Pankaj Gupta <pankaj.gupta(a)ionos.com>
Reviewed-by: Muchun Song <songmuchun(a)bytedance.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Vitaly Kuznetsov <vkuznets(a)redhat.com>
Cc: "Michael S. Tsirkin" <mst(a)redhat.com>
Cc: Jason Wang <jasowang(a)redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux(a)gmail.com>
Cc: Wei Yang <richard.weiyang(a)linux.alibaba.com>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Anshuman Khandual <anshuman.khandual(a)arm.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Mike Rapoport <rppt(a)kernel.org>
Cc: "Rafael J. Wysocki" <rjw(a)rjwysocki.net>
Cc: Len Brown <lenb(a)kernel.org>
Cc: Pavel Tatashin <pasha.tatashin(a)soleen.com>
Cc: Heiko Carstens <hca(a)linux.ibm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: virtualization(a)lists.linux-foundation.org
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.ibm.com>
Cc: Anton Blanchard <anton(a)ozlabs.org>
Cc: Ard Biesheuvel <ardb(a)kernel.org>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Benjamin Herrenschmidt <benh(a)kernel.crashing.org>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Christian Borntraeger <borntraeger(a)de.ibm.com>
Cc: Christophe Leroy <christophe.leroy(a)c-s.fr>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jia He <justin.he(a)arm.com>
Cc: Joe Perches <joe(a)perches.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Laurent Dufour <ldufour(a)linux.ibm.com>
Cc: Michel Lespinasse <michel(a)lespinasse.org>
Cc: Nathan Lynch <nathanl(a)linux.ibm.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Paul Mackerras <paulus(a)samba.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Pierre Morel <pmorel(a)linux.ibm.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki(a)intel.com>
Cc: Rich Felker <dalias(a)libc.org>
Cc: Scott Cheloha <cheloha(a)linux.ibm.com>
Cc: Sergei Trofimovich <slyfox(a)gentoo.org>
Cc: Thiago Jung Bauermann <bauerman(a)linux.ibm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Vasily Gorbik <gor(a)linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: Yoshinori Sato <ysato(a)users.sourceforge.jp>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
---
include/linux/memory_hotplug.h | 4 ++--
mm/memory_hotplug.c | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index d36a02935391..8c48d9e0ca22 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -332,6 +332,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
unsigned long pnum);
extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
int online_type);
-extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
- unsigned long nr_pages);
+extern struct zone *zone_for_pfn_range(int online_type, int nid,
+ unsigned long start_pfn, unsigned long nr_pages);
#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2d6626ab29d1..c2878d48b8b1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -842,8 +842,8 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn
return movable_node_enabled ? movable_zone : kernel_zone;
}
-struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
- unsigned long nr_pages)
+struct zone *zone_for_pfn_range(int online_type, int nid,
+ unsigned long start_pfn, unsigned long nr_pages)
{
if (online_type == MMOP_ONLINE_KERNEL)
return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
--
2.31.1
On Mon, Sep 20, 2021 at 10:57:34AM +0200, Michel Dänzer wrote:
> On 2021-09-20 10:52, gregkh(a)linuxfoundation.org wrote:
> >
> > The patch below does not apply to the 5.14-stable tree.
> > If someone wants it applied there, or to any other stable or longterm
> > tree, then please email the backport, including the original git commit
> > id to <stable(a)vger.kernel.org>.
>
> I wrote a comment above the Fixes line:
>
> # The function is called amdgpu_ras_eeprom_get_record_max_length on
> # stable branches
>
>
> Looks like that got lost somewhere along the way.
Can you send a working patch for these older kernels that I can queue
up?
thanks,
greg k-h
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ae7aaecc3f2f78b76ab3a8d6178610f55aadfa56 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin(a)gmail.com>
Date: Wed, 8 Sep 2021 20:17:17 +1000
Subject: [PATCH] powerpc/64s: system call rfscv workaround for TM bugs
The rfscv instruction does not work correctly with the fake-suspend mode
in POWER9, which can end up with the hypervisor restoring an incorrect
checkpoint.
Work around this by setting the _TIF_RESTOREALL flag if a system call
returns to a transaction active state, causing rfid to be used instead
of rfscv to return, which will do the right thing. The contents of the
registers are irrelevant because they will be overwritten in this case
anyway.
Fixes: 7fa95f9adaee7 ("powerpc/64s: system call support for scv/rfscv instructions")
Reported-by: Eirik Fuller <efuller(a)redhat.com>
Signed-off-by: Nicholas Piggin <npiggin(a)gmail.com>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20210908101718.118522-1-npiggin@gmail.com
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 2ccc7ea5db00..de10a2697258 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -137,6 +137,19 @@ notrace long system_call_exception(long r3, long r4, long r5,
*/
irq_soft_mask_regs_set_state(regs, IRQS_ENABLED);
+ /*
+ * If system call is called with TM active, set _TIF_RESTOREALL to
+ * prevent RFSCV being used to return to userspace, because POWER9
+ * TM implementation has problems with this instruction returning to
+ * transactional state. Final register values are not relevant because
+ * the transaction will be aborted upon return anyway. Or in the case
+ * of unsupported_scv SIGILL fault, the return state does not much
+ * matter because it's an edge case.
+ */
+ if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
+ unlikely(MSR_TM_TRANSACTIONAL(regs->msr)))
+ current_thread_info()->flags |= _TIF_RESTOREALL;
+
/*
* If the system call was made with a transaction active, doom it and
* return without performing the system call. Unless it was an
The patch below does not apply to the 5.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 114518ff3b30a3f0611f384fb58e0a968fdf7f5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <mdaenzer(a)redhat.com>
Date: Thu, 9 Sep 2021 18:56:28 +0200
Subject: [PATCH] drm/amdgpu: Drop inline from
amdgpu_ras_eeprom_max_record_count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This was unusual; normally, inline functions are declared static as
well, and defined in a header file if used by multiple compilation
units. The latter would be more involved in this case, so just drop
the inline declaration for now.
Fixes compile failure building for ppc64le on RHEL 8:
In file included from ../drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h:32,
from ../drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:33:
../drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c: In function ‘amdgpu_ras_recovery_init’:
../drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h:90:17: error: inlining failed in call
to ‘always_inline’ ‘amdgpu_ras_eeprom_max_record_count’: function body not available
90 | inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:1985:34: note: called from here
1985 | max_eeprom_records_len = amdgpu_ras_eeprom_max_record_count();
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Fixes: c84d46707ebb "drm/amdgpu: validate bad page threshold in ras(v3)"
Reviewed-by: Lyude Paul <lyude(a)redhat.com>
Signed-off-by: Michel Dänzer <mdaenzer(a)redhat.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index dc44c946a244..98732518543e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -757,7 +757,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
return res;
}
-inline uint32_t amdgpu_ras_eeprom_max_record_count(void)
+uint32_t amdgpu_ras_eeprom_max_record_count(void)
{
return RAS_MAX_RECORD_COUNT;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index f95fc61b3021..6bb00578bfbb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -120,7 +120,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *records, const u32 num);
-inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
+uint32_t amdgpu_ras_eeprom_max_record_count(void);
void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 114518ff3b30a3f0611f384fb58e0a968fdf7f5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <mdaenzer(a)redhat.com>
Date: Thu, 9 Sep 2021 18:56:28 +0200
Subject: [PATCH] drm/amdgpu: Drop inline from
amdgpu_ras_eeprom_max_record_count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This was unusual; normally, inline functions are declared static as
well, and defined in a header file if used by multiple compilation
units. The latter would be more involved in this case, so just drop
the inline declaration for now.
Fixes compile failure building for ppc64le on RHEL 8:
In file included from ../drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h:32,
from ../drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:33:
../drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c: In function ‘amdgpu_ras_recovery_init’:
../drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h:90:17: error: inlining failed in call
to ‘always_inline’ ‘amdgpu_ras_eeprom_max_record_count’: function body not available
90 | inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:1985:34: note: called from here
1985 | max_eeprom_records_len = amdgpu_ras_eeprom_max_record_count();
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Fixes: c84d46707ebb "drm/amdgpu: validate bad page threshold in ras(v3)"
Reviewed-by: Lyude Paul <lyude(a)redhat.com>
Signed-off-by: Michel Dänzer <mdaenzer(a)redhat.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index dc44c946a244..98732518543e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -757,7 +757,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
return res;
}
-inline uint32_t amdgpu_ras_eeprom_max_record_count(void)
+uint32_t amdgpu_ras_eeprom_max_record_count(void)
{
return RAS_MAX_RECORD_COUNT;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index f95fc61b3021..6bb00578bfbb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -120,7 +120,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *records, const u32 num);
-inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
+uint32_t amdgpu_ras_eeprom_max_record_count(void);
void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel(a)iogearbox.net>
Date: Tue, 14 Sep 2021 01:07:57 +0200
Subject: [PATCH] bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode
Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
embedded per-socket cgroup information into sock->sk_cgrp_data and in order
to save 8 bytes in struct sock made both mutually exclusive, that is, when
cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2
falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).
The assumption made was "there is no reason to mix the two and this is in line
with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
However, with Kubernetes more widely supporting cgroups v2 as well nowadays,
this assumption no longer holds, and the possibility of the v1/v2 mixed mode
with the v2 root fallback being hit becomes a real security issue.
Many of the cgroup v2 BPF programs are also used for policy enforcement, just
to pick _one_ example, that is, to programmatically deny socket related system
calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
a policy bypass for the affected Pods.
In production environments, we have recently seen this case due to various
circumstances: i) a different 3rd party agent and/or ii) a container runtime
such as [0] in the user's environment configuring legacy cgroup v1 net_cls
tags, which triggered implicitly mentioned root fallback. Another case is
Kubernetes projects like kind [1] which create Kubernetes nodes in a container
and also add cgroup namespaces to the mix, meaning programs which are attached
to the cgroup v2 root of the cgroup namespace get attached to a non-root
cgroup v2 path from init namespace point of view. And the latter's root is
out of reach for agents on a kind Kubernetes node to configure. Meaning, any
entity on the node setting cgroup v1 net_cls tag will trigger the bypass
despite cgroup v2 BPF programs attached to the namespace root.
Generally, this mutual exclusiveness does not hold anymore in today's user
environments and makes cgroup v2 usage from BPF side fragile and unreliable.
This fix adds proper struct cgroup pointer for the cgroup v2 case to struct
sock_cgroup_data in order to address these issues; this implicitly also fixes
the tradeoffs being made back then with regards to races and refcount leaks
as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF
programs always operate as expected.
[0] https://github.com/nestybox/sysbox/
[1] https://kind.sigs.k8s.io/
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast(a)kernel.org>
Acked-by: Stanislav Fomichev <sdf(a)google.com>
Acked-by: Tejun Heo <tj(a)kernel.org>
Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e1c705fdfa7c..db2e147e069f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
* sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
* per-socket cgroup information except for memcg association.
*
- * On legacy hierarchies, net_prio and net_cls controllers directly set
- * attributes on each sock which can then be tested by the network layer.
- * On the default hierarchy, each sock is associated with the cgroup it was
- * created in and the networking layer can match the cgroup directly.
- *
- * To avoid carrying all three cgroup related fields separately in sock,
- * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
- * On boot, sock_cgroup_data records the cgroup that the sock was created
- * in so that cgroup2 matches can be made; however, once either net_prio or
- * net_cls starts being used, the area is overridden to carry prioidx and/or
- * classid. The two modes are distinguished by whether the lowest bit is
- * set. Clear bit indicates cgroup pointer while set bit prioidx and
- * classid.
- *
- * While userland may start using net_prio or net_cls at any time, once
- * either is used, cgroup2 matching no longer works. There is no reason to
- * mix the two and this is in line with how legacy and v2 compatibility is
- * handled. On mode switch, cgroup references which are already being
- * pointed to by socks may be leaked. While this can be remedied by adding
- * synchronization around sock_cgroup_data, given that the number of leaked
- * cgroups is bound and highly unlikely to be high, this seems to be the
- * better trade-off.
+ * On legacy hierarchies, net_prio and net_cls controllers directly
+ * set attributes on each sock which can then be tested by the network
+ * layer. On the default hierarchy, each sock is associated with the
+ * cgroup it was created in and the networking layer can match the
+ * cgroup directly.
*/
struct sock_cgroup_data {
- union {
-#ifdef __LITTLE_ENDIAN
- struct {
- u8 is_data : 1;
- u8 no_refcnt : 1;
- u8 unused : 6;
- u8 padding;
- u16 prioidx;
- u32 classid;
- } __packed;
-#else
- struct {
- u32 classid;
- u16 prioidx;
- u8 padding;
- u8 unused : 6;
- u8 no_refcnt : 1;
- u8 is_data : 1;
- } __packed;
+ struct cgroup *cgroup; /* v2 */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ u32 classid; /* v1 */
+#endif
+#ifdef CONFIG_CGROUP_NET_PRIO
+ u16 prioidx; /* v1 */
#endif
- u64 val;
- };
};
-/*
- * There's a theoretical window where the following accessors race with
- * updaters and return part of the previous pointer as the prioidx or
- * classid. Such races are short-lived and the result isn't critical.
- */
static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
{
- /* fallback to 1 which is always the ID of the root cgroup */
- return (skcd->is_data & 1) ? skcd->prioidx : 1;
+#ifdef CONFIG_CGROUP_NET_PRIO
+ return READ_ONCE(skcd->prioidx);
+#else
+ return 1;
+#endif
}
static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
{
- /* fallback to 0 which is the unconfigured default classid */
- return (skcd->is_data & 1) ? skcd->classid : 0;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ return READ_ONCE(skcd->classid);
+#else
+ return 0;
+#endif
}
-/*
- * If invoked concurrently, the updaters may clobber each other. The
- * caller is responsible for synchronization.
- */
static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
u16 prioidx)
{
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
- if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
- return;
-
- if (!(skcd_buf.is_data & 1)) {
- skcd_buf.val = 0;
- skcd_buf.is_data = 1;
- }
-
- skcd_buf.prioidx = prioidx;
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_PRIO
+ WRITE_ONCE(skcd->prioidx, prioidx);
+#endif
}
static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
u32 classid)
{
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
- if (sock_cgroup_classid(&skcd_buf) == classid)
- return;
-
- if (!(skcd_buf.is_data & 1)) {
- skcd_buf.val = 0;
- skcd_buf.is_data = 1;
- }
-
- skcd_buf.classid = classid;
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ WRITE_ONCE(skcd->classid, classid);
+#endif
}
#else /* CONFIG_SOCK_CGROUP_DATA */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7bf60454a313..75c151413fda 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -829,33 +829,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task,
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-extern spinlock_t cgroup_sk_update_lock;
-#endif
-
-void cgroup_sk_alloc_disable(void);
void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
- unsigned long v;
-
- /*
- * @skcd->val is 64bit but the following is safe on 32bit too as we
- * just need the lower ulong to be written and read atomically.
- */
- v = READ_ONCE(skcd->val);
-
- if (v & 3)
- return &cgrp_dfl_root.cgrp;
-
- return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
-#else
- return (struct cgroup *)(unsigned long)skcd->val;
-#endif
+ return skcd->cgroup;
}
#else /* CONFIG_CGROUP_DATA */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 881ce1470beb..8afa8690d288 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled) {
- skcd->no_refcnt = 1;
- return;
- }
-
/* Don't associate the sock with unrelated interrupted task's cgroup. */
if (in_interrupt())
return;
rcu_read_lock();
-
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
+ skcd->cgroup = cset->dfl_cgrp;
cgroup_bpf_get(cset->dfl_cgrp);
break;
}
cpu_relax();
}
-
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
- if (skcd->val) {
- if (skcd->no_refcnt)
- return;
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- }
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- if (skcd->no_refcnt)
- return;
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index b49c57d35a88..1a6a86693b74 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -71,11 +71,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
struct update_classid_context *ctx = (void *)v;
struct socket *sock = sock_from_file(file);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+ if (sock)
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
- spin_unlock(&cgroup_sk_update_lock);
- }
if (--ctx->batch == 0) {
ctx->batch = UPDATE_CLASSID_BATCH;
return n + 1;
@@ -121,8 +118,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
struct css_task_iter it;
struct task_struct *p;
- cgroup_sk_alloc_disable();
-
cs->classid = (u32)value;
css_task_iter_start(css, 0, &it);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 99a431c56f23..8456dfbe2eb4 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (!dev)
return -ENODEV;
- cgroup_sk_alloc_disable();
-
rtnl_lock();
ret = netprio_set_prio(of_css(of), dev, prio);
@@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
static int update_netprio(const void *v, struct file *file, unsigned n)
{
struct socket *sock = sock_from_file(file);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+
+ if (sock)
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
(unsigned long)v);
- spin_unlock(&cgroup_sk_update_lock);
- }
return 0;
}
@@ -235,8 +231,6 @@ static void net_prio_attach(struct cgroup_taskset *tset)
struct task_struct *p;
struct cgroup_subsys_state *css;
- cgroup_sk_alloc_disable();
-
cgroup_taskset_for_each(p, css, tset) {
void *v = (void *)(unsigned long)css->id;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel(a)iogearbox.net>
Date: Tue, 14 Sep 2021 01:07:57 +0200
Subject: [PATCH] bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode
Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
embedded per-socket cgroup information into sock->sk_cgrp_data and in order
to save 8 bytes in struct sock made both mutually exclusive, that is, when
cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2
falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).
The assumption made was "there is no reason to mix the two and this is in line
with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
However, with Kubernetes more widely supporting cgroups v2 as well nowadays,
this assumption no longer holds, and the possibility of the v1/v2 mixed mode
with the v2 root fallback being hit becomes a real security issue.
Many of the cgroup v2 BPF programs are also used for policy enforcement, just
to pick _one_ example, that is, to programmatically deny socket related system
calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
a policy bypass for the affected Pods.
In production environments, we have recently seen this case due to various
circumstances: i) a different 3rd party agent and/or ii) a container runtime
such as [0] in the user's environment configuring legacy cgroup v1 net_cls
tags, which triggered implicitly mentioned root fallback. Another case is
Kubernetes projects like kind [1] which create Kubernetes nodes in a container
and also add cgroup namespaces to the mix, meaning programs which are attached
to the cgroup v2 root of the cgroup namespace get attached to a non-root
cgroup v2 path from init namespace point of view. And the latter's root is
out of reach for agents on a kind Kubernetes node to configure. Meaning, any
entity on the node setting cgroup v1 net_cls tag will trigger the bypass
despite cgroup v2 BPF programs attached to the namespace root.
Generally, this mutual exclusiveness does not hold anymore in today's user
environments and makes cgroup v2 usage from BPF side fragile and unreliable.
This fix adds proper struct cgroup pointer for the cgroup v2 case to struct
sock_cgroup_data in order to address these issues; this implicitly also fixes
the tradeoffs being made back then with regards to races and refcount leaks
as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF
programs always operate as expected.
[0] https://github.com/nestybox/sysbox/
[1] https://kind.sigs.k8s.io/
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast(a)kernel.org>
Acked-by: Stanislav Fomichev <sdf(a)google.com>
Acked-by: Tejun Heo <tj(a)kernel.org>
Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e1c705fdfa7c..db2e147e069f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
* sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
* per-socket cgroup information except for memcg association.
*
- * On legacy hierarchies, net_prio and net_cls controllers directly set
- * attributes on each sock which can then be tested by the network layer.
- * On the default hierarchy, each sock is associated with the cgroup it was
- * created in and the networking layer can match the cgroup directly.
- *
- * To avoid carrying all three cgroup related fields separately in sock,
- * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
- * On boot, sock_cgroup_data records the cgroup that the sock was created
- * in so that cgroup2 matches can be made; however, once either net_prio or
- * net_cls starts being used, the area is overridden to carry prioidx and/or
- * classid. The two modes are distinguished by whether the lowest bit is
- * set. Clear bit indicates cgroup pointer while set bit prioidx and
- * classid.
- *
- * While userland may start using net_prio or net_cls at any time, once
- * either is used, cgroup2 matching no longer works. There is no reason to
- * mix the two and this is in line with how legacy and v2 compatibility is
- * handled. On mode switch, cgroup references which are already being
- * pointed to by socks may be leaked. While this can be remedied by adding
- * synchronization around sock_cgroup_data, given that the number of leaked
- * cgroups is bound and highly unlikely to be high, this seems to be the
- * better trade-off.
+ * On legacy hierarchies, net_prio and net_cls controllers directly
+ * set attributes on each sock which can then be tested by the network
+ * layer. On the default hierarchy, each sock is associated with the
+ * cgroup it was created in and the networking layer can match the
+ * cgroup directly.
*/
struct sock_cgroup_data {
- union {
-#ifdef __LITTLE_ENDIAN
- struct {
- u8 is_data : 1;
- u8 no_refcnt : 1;
- u8 unused : 6;
- u8 padding;
- u16 prioidx;
- u32 classid;
- } __packed;
-#else
- struct {
- u32 classid;
- u16 prioidx;
- u8 padding;
- u8 unused : 6;
- u8 no_refcnt : 1;
- u8 is_data : 1;
- } __packed;
+ struct cgroup *cgroup; /* v2 */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ u32 classid; /* v1 */
+#endif
+#ifdef CONFIG_CGROUP_NET_PRIO
+ u16 prioidx; /* v1 */
#endif
- u64 val;
- };
};
-/*
- * There's a theoretical window where the following accessors race with
- * updaters and return part of the previous pointer as the prioidx or
- * classid. Such races are short-lived and the result isn't critical.
- */
static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
{
- /* fallback to 1 which is always the ID of the root cgroup */
- return (skcd->is_data & 1) ? skcd->prioidx : 1;
+#ifdef CONFIG_CGROUP_NET_PRIO
+ return READ_ONCE(skcd->prioidx);
+#else
+ return 1;
+#endif
}
static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
{
- /* fallback to 0 which is the unconfigured default classid */
- return (skcd->is_data & 1) ? skcd->classid : 0;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ return READ_ONCE(skcd->classid);
+#else
+ return 0;
+#endif
}
-/*
- * If invoked concurrently, the updaters may clobber each other. The
- * caller is responsible for synchronization.
- */
static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
u16 prioidx)
{
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
- if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
- return;
-
- if (!(skcd_buf.is_data & 1)) {
- skcd_buf.val = 0;
- skcd_buf.is_data = 1;
- }
-
- skcd_buf.prioidx = prioidx;
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_PRIO
+ WRITE_ONCE(skcd->prioidx, prioidx);
+#endif
}
static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
u32 classid)
{
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
- if (sock_cgroup_classid(&skcd_buf) == classid)
- return;
-
- if (!(skcd_buf.is_data & 1)) {
- skcd_buf.val = 0;
- skcd_buf.is_data = 1;
- }
-
- skcd_buf.classid = classid;
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ WRITE_ONCE(skcd->classid, classid);
+#endif
}
#else /* CONFIG_SOCK_CGROUP_DATA */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7bf60454a313..75c151413fda 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -829,33 +829,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task,
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-extern spinlock_t cgroup_sk_update_lock;
-#endif
-
-void cgroup_sk_alloc_disable(void);
void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
- unsigned long v;
-
- /*
- * @skcd->val is 64bit but the following is safe on 32bit too as we
- * just need the lower ulong to be written and read atomically.
- */
- v = READ_ONCE(skcd->val);
-
- if (v & 3)
- return &cgrp_dfl_root.cgrp;
-
- return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
-#else
- return (struct cgroup *)(unsigned long)skcd->val;
-#endif
+ return skcd->cgroup;
}
#else /* CONFIG_CGROUP_DATA */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 881ce1470beb..8afa8690d288 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled) {
- skcd->no_refcnt = 1;
- return;
- }
-
/* Don't associate the sock with unrelated interrupted task's cgroup. */
if (in_interrupt())
return;
rcu_read_lock();
-
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
+ skcd->cgroup = cset->dfl_cgrp;
cgroup_bpf_get(cset->dfl_cgrp);
break;
}
cpu_relax();
}
-
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
- if (skcd->val) {
- if (skcd->no_refcnt)
- return;
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- }
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- if (skcd->no_refcnt)
- return;
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index b49c57d35a88..1a6a86693b74 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -71,11 +71,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
struct update_classid_context *ctx = (void *)v;
struct socket *sock = sock_from_file(file);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+ if (sock)
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
- spin_unlock(&cgroup_sk_update_lock);
- }
if (--ctx->batch == 0) {
ctx->batch = UPDATE_CLASSID_BATCH;
return n + 1;
@@ -121,8 +118,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
struct css_task_iter it;
struct task_struct *p;
- cgroup_sk_alloc_disable();
-
cs->classid = (u32)value;
css_task_iter_start(css, 0, &it);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 99a431c56f23..8456dfbe2eb4 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (!dev)
return -ENODEV;
- cgroup_sk_alloc_disable();
-
rtnl_lock();
ret = netprio_set_prio(of_css(of), dev, prio);
@@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
static int update_netprio(const void *v, struct file *file, unsigned n)
{
struct socket *sock = sock_from_file(file);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+
+ if (sock)
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
(unsigned long)v);
- spin_unlock(&cgroup_sk_update_lock);
- }
return 0;
}
@@ -235,8 +231,6 @@ static void net_prio_attach(struct cgroup_taskset *tset)
struct task_struct *p;
struct cgroup_subsys_state *css;
- cgroup_sk_alloc_disable();
-
cgroup_taskset_for_each(p, css, tset) {
void *v = (void *)(unsigned long)css->id;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel(a)iogearbox.net>
Date: Tue, 14 Sep 2021 01:07:57 +0200
Subject: [PATCH] bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode
Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
embedded per-socket cgroup information into sock->sk_cgrp_data and in order
to save 8 bytes in struct sock made both mutually exclusive, that is, when
cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2
falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).
The assumption made was "there is no reason to mix the two and this is in line
with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
However, with Kubernetes more widely supporting cgroups v2 as well nowadays,
this assumption no longer holds, and the possibility of the v1/v2 mixed mode
with the v2 root fallback being hit becomes a real security issue.
Many of the cgroup v2 BPF programs are also used for policy enforcement, just
to pick _one_ example, that is, to programmatically deny socket related system
calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
a policy bypass for the affected Pods.
In production environments, we have recently seen this case due to various
circumstances: i) a different 3rd party agent and/or ii) a container runtime
such as [0] in the user's environment configuring legacy cgroup v1 net_cls
tags, which triggered implicitly mentioned root fallback. Another case is
Kubernetes projects like kind [1] which create Kubernetes nodes in a container
and also add cgroup namespaces to the mix, meaning programs which are attached
to the cgroup v2 root of the cgroup namespace get attached to a non-root
cgroup v2 path from init namespace point of view. And the latter's root is
out of reach for agents on a kind Kubernetes node to configure. Meaning, any
entity on the node setting cgroup v1 net_cls tag will trigger the bypass
despite cgroup v2 BPF programs attached to the namespace root.
Generally, this mutual exclusiveness does not hold anymore in today's user
environments and makes cgroup v2 usage from BPF side fragile and unreliable.
This fix adds proper struct cgroup pointer for the cgroup v2 case to struct
sock_cgroup_data in order to address these issues; this implicitly also fixes
the tradeoffs being made back then with regards to races and refcount leaks
as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF
programs always operate as expected.
[0] https://github.com/nestybox/sysbox/
[1] https://kind.sigs.k8s.io/
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast(a)kernel.org>
Acked-by: Stanislav Fomichev <sdf(a)google.com>
Acked-by: Tejun Heo <tj(a)kernel.org>
Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e1c705fdfa7c..db2e147e069f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
* sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
* per-socket cgroup information except for memcg association.
*
- * On legacy hierarchies, net_prio and net_cls controllers directly set
- * attributes on each sock which can then be tested by the network layer.
- * On the default hierarchy, each sock is associated with the cgroup it was
- * created in and the networking layer can match the cgroup directly.
- *
- * To avoid carrying all three cgroup related fields separately in sock,
- * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
- * On boot, sock_cgroup_data records the cgroup that the sock was created
- * in so that cgroup2 matches can be made; however, once either net_prio or
- * net_cls starts being used, the area is overridden to carry prioidx and/or
- * classid. The two modes are distinguished by whether the lowest bit is
- * set. Clear bit indicates cgroup pointer while set bit prioidx and
- * classid.
- *
- * While userland may start using net_prio or net_cls at any time, once
- * either is used, cgroup2 matching no longer works. There is no reason to
- * mix the two and this is in line with how legacy and v2 compatibility is
- * handled. On mode switch, cgroup references which are already being
- * pointed to by socks may be leaked. While this can be remedied by adding
- * synchronization around sock_cgroup_data, given that the number of leaked
- * cgroups is bound and highly unlikely to be high, this seems to be the
- * better trade-off.
+ * On legacy hierarchies, net_prio and net_cls controllers directly
+ * set attributes on each sock which can then be tested by the network
+ * layer. On the default hierarchy, each sock is associated with the
+ * cgroup it was created in and the networking layer can match the
+ * cgroup directly.
*/
struct sock_cgroup_data {
- union {
-#ifdef __LITTLE_ENDIAN
- struct {
- u8 is_data : 1;
- u8 no_refcnt : 1;
- u8 unused : 6;
- u8 padding;
- u16 prioidx;
- u32 classid;
- } __packed;
-#else
- struct {
- u32 classid;
- u16 prioidx;
- u8 padding;
- u8 unused : 6;
- u8 no_refcnt : 1;
- u8 is_data : 1;
- } __packed;
+ struct cgroup *cgroup; /* v2 */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ u32 classid; /* v1 */
+#endif
+#ifdef CONFIG_CGROUP_NET_PRIO
+ u16 prioidx; /* v1 */
#endif
- u64 val;
- };
};
-/*
- * There's a theoretical window where the following accessors race with
- * updaters and return part of the previous pointer as the prioidx or
- * classid. Such races are short-lived and the result isn't critical.
- */
static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
{
- /* fallback to 1 which is always the ID of the root cgroup */
- return (skcd->is_data & 1) ? skcd->prioidx : 1;
+#ifdef CONFIG_CGROUP_NET_PRIO
+ return READ_ONCE(skcd->prioidx);
+#else
+ return 1;
+#endif
}
static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
{
- /* fallback to 0 which is the unconfigured default classid */
- return (skcd->is_data & 1) ? skcd->classid : 0;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ return READ_ONCE(skcd->classid);
+#else
+ return 0;
+#endif
}
-/*
- * If invoked concurrently, the updaters may clobber each other. The
- * caller is responsible for synchronization.
- */
static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
u16 prioidx)
{
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
- if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
- return;
-
- if (!(skcd_buf.is_data & 1)) {
- skcd_buf.val = 0;
- skcd_buf.is_data = 1;
- }
-
- skcd_buf.prioidx = prioidx;
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_PRIO
+ WRITE_ONCE(skcd->prioidx, prioidx);
+#endif
}
static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
u32 classid)
{
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
- if (sock_cgroup_classid(&skcd_buf) == classid)
- return;
-
- if (!(skcd_buf.is_data & 1)) {
- skcd_buf.val = 0;
- skcd_buf.is_data = 1;
- }
-
- skcd_buf.classid = classid;
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ WRITE_ONCE(skcd->classid, classid);
+#endif
}
#else /* CONFIG_SOCK_CGROUP_DATA */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7bf60454a313..75c151413fda 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -829,33 +829,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task,
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-extern spinlock_t cgroup_sk_update_lock;
-#endif
-
-void cgroup_sk_alloc_disable(void);
void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
- unsigned long v;
-
- /*
- * @skcd->val is 64bit but the following is safe on 32bit too as we
- * just need the lower ulong to be written and read atomically.
- */
- v = READ_ONCE(skcd->val);
-
- if (v & 3)
- return &cgrp_dfl_root.cgrp;
-
- return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
-#else
- return (struct cgroup *)(unsigned long)skcd->val;
-#endif
+ return skcd->cgroup;
}
#else /* CONFIG_CGROUP_DATA */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 881ce1470beb..8afa8690d288 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled) {
- skcd->no_refcnt = 1;
- return;
- }
-
/* Don't associate the sock with unrelated interrupted task's cgroup. */
if (in_interrupt())
return;
rcu_read_lock();
-
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
+ skcd->cgroup = cset->dfl_cgrp;
cgroup_bpf_get(cset->dfl_cgrp);
break;
}
cpu_relax();
}
-
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
- if (skcd->val) {
- if (skcd->no_refcnt)
- return;
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- }
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- if (skcd->no_refcnt)
- return;
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index b49c57d35a88..1a6a86693b74 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -71,11 +71,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
struct update_classid_context *ctx = (void *)v;
struct socket *sock = sock_from_file(file);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+ if (sock)
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
- spin_unlock(&cgroup_sk_update_lock);
- }
if (--ctx->batch == 0) {
ctx->batch = UPDATE_CLASSID_BATCH;
return n + 1;
@@ -121,8 +118,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
struct css_task_iter it;
struct task_struct *p;
- cgroup_sk_alloc_disable();
-
cs->classid = (u32)value;
css_task_iter_start(css, 0, &it);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 99a431c56f23..8456dfbe2eb4 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (!dev)
return -ENODEV;
- cgroup_sk_alloc_disable();
-
rtnl_lock();
ret = netprio_set_prio(of_css(of), dev, prio);
@@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
static int update_netprio(const void *v, struct file *file, unsigned n)
{
struct socket *sock = sock_from_file(file);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+
+ if (sock)
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
(unsigned long)v);
- spin_unlock(&cgroup_sk_update_lock);
- }
return 0;
}
@@ -235,8 +231,6 @@ static void net_prio_attach(struct cgroup_taskset *tset)
struct task_struct *p;
struct cgroup_subsys_state *css;
- cgroup_sk_alloc_disable();
-
cgroup_taskset_for_each(p, css, tset) {
void *v = (void *)(unsigned long)css->id;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel(a)iogearbox.net>
Date: Tue, 14 Sep 2021 01:07:57 +0200
Subject: [PATCH] bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode
Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
embedded per-socket cgroup information into sock->sk_cgrp_data and in order
to save 8 bytes in struct sock made both mutually exclusive, that is, when
cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2
falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).
The assumption made was "there is no reason to mix the two and this is in line
with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
However, with Kubernetes more widely supporting cgroups v2 as well nowadays,
this assumption no longer holds, and the possibility of the v1/v2 mixed mode
with the v2 root fallback being hit becomes a real security issue.
Many of the cgroup v2 BPF programs are also used for policy enforcement, just
to pick _one_ example, that is, to programmatically deny socket related system
calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
a policy bypass for the affected Pods.
In production environments, we have recently seen this case due to various
circumstances: i) a different 3rd party agent and/or ii) a container runtime
such as [0] in the user's environment configuring legacy cgroup v1 net_cls
tags, which triggered implicitly mentioned root fallback. Another case is
Kubernetes projects like kind [1] which create Kubernetes nodes in a container
and also add cgroup namespaces to the mix, meaning programs which are attached
to the cgroup v2 root of the cgroup namespace get attached to a non-root
cgroup v2 path from init namespace point of view. And the latter's root is
out of reach for agents on a kind Kubernetes node to configure. Meaning, any
entity on the node setting cgroup v1 net_cls tag will trigger the bypass
despite cgroup v2 BPF programs attached to the namespace root.
Generally, this mutual exclusiveness does not hold anymore in today's user
environments and makes cgroup v2 usage from BPF side fragile and unreliable.
This fix adds proper struct cgroup pointer for the cgroup v2 case to struct
sock_cgroup_data in order to address these issues; this implicitly also fixes
the tradeoffs being made back then with regards to races and refcount leaks
as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF
programs always operate as expected.
[0] https://github.com/nestybox/sysbox/
[1] https://kind.sigs.k8s.io/
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast(a)kernel.org>
Acked-by: Stanislav Fomichev <sdf(a)google.com>
Acked-by: Tejun Heo <tj(a)kernel.org>
Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e1c705fdfa7c..db2e147e069f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
* sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
* per-socket cgroup information except for memcg association.
*
- * On legacy hierarchies, net_prio and net_cls controllers directly set
- * attributes on each sock which can then be tested by the network layer.
- * On the default hierarchy, each sock is associated with the cgroup it was
- * created in and the networking layer can match the cgroup directly.
- *
- * To avoid carrying all three cgroup related fields separately in sock,
- * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
- * On boot, sock_cgroup_data records the cgroup that the sock was created
- * in so that cgroup2 matches can be made; however, once either net_prio or
- * net_cls starts being used, the area is overridden to carry prioidx and/or
- * classid. The two modes are distinguished by whether the lowest bit is
- * set. Clear bit indicates cgroup pointer while set bit prioidx and
- * classid.
- *
- * While userland may start using net_prio or net_cls at any time, once
- * either is used, cgroup2 matching no longer works. There is no reason to
- * mix the two and this is in line with how legacy and v2 compatibility is
- * handled. On mode switch, cgroup references which are already being
- * pointed to by socks may be leaked. While this can be remedied by adding
- * synchronization around sock_cgroup_data, given that the number of leaked
- * cgroups is bound and highly unlikely to be high, this seems to be the
- * better trade-off.
+ * On legacy hierarchies, net_prio and net_cls controllers directly
+ * set attributes on each sock which can then be tested by the network
+ * layer. On the default hierarchy, each sock is associated with the
+ * cgroup it was created in and the networking layer can match the
+ * cgroup directly.
*/
struct sock_cgroup_data {
- union {
-#ifdef __LITTLE_ENDIAN
- struct {
- u8 is_data : 1;
- u8 no_refcnt : 1;
- u8 unused : 6;
- u8 padding;
- u16 prioidx;
- u32 classid;
- } __packed;
-#else
- struct {
- u32 classid;
- u16 prioidx;
- u8 padding;
- u8 unused : 6;
- u8 no_refcnt : 1;
- u8 is_data : 1;
- } __packed;
+ struct cgroup *cgroup; /* v2 */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ u32 classid; /* v1 */
+#endif
+#ifdef CONFIG_CGROUP_NET_PRIO
+ u16 prioidx; /* v1 */
#endif
- u64 val;
- };
};
-/*
- * There's a theoretical window where the following accessors race with
- * updaters and return part of the previous pointer as the prioidx or
- * classid. Such races are short-lived and the result isn't critical.
- */
static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
{
- /* fallback to 1 which is always the ID of the root cgroup */
- return (skcd->is_data & 1) ? skcd->prioidx : 1;
+#ifdef CONFIG_CGROUP_NET_PRIO
+ return READ_ONCE(skcd->prioidx);
+#else
+ return 1;
+#endif
}
static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
{
- /* fallback to 0 which is the unconfigured default classid */
- return (skcd->is_data & 1) ? skcd->classid : 0;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ return READ_ONCE(skcd->classid);
+#else
+ return 0;
+#endif
}
-/*
- * If invoked concurrently, the updaters may clobber each other. The
- * caller is responsible for synchronization.
- */
static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
u16 prioidx)
{
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
- if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
- return;
-
- if (!(skcd_buf.is_data & 1)) {
- skcd_buf.val = 0;
- skcd_buf.is_data = 1;
- }
-
- skcd_buf.prioidx = prioidx;
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_PRIO
+ WRITE_ONCE(skcd->prioidx, prioidx);
+#endif
}
static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
u32 classid)
{
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
- if (sock_cgroup_classid(&skcd_buf) == classid)
- return;
-
- if (!(skcd_buf.is_data & 1)) {
- skcd_buf.val = 0;
- skcd_buf.is_data = 1;
- }
-
- skcd_buf.classid = classid;
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ WRITE_ONCE(skcd->classid, classid);
+#endif
}
#else /* CONFIG_SOCK_CGROUP_DATA */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7bf60454a313..75c151413fda 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -829,33 +829,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task,
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-extern spinlock_t cgroup_sk_update_lock;
-#endif
-
-void cgroup_sk_alloc_disable(void);
void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
- unsigned long v;
-
- /*
- * @skcd->val is 64bit but the following is safe on 32bit too as we
- * just need the lower ulong to be written and read atomically.
- */
- v = READ_ONCE(skcd->val);
-
- if (v & 3)
- return &cgrp_dfl_root.cgrp;
-
- return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
-#else
- return (struct cgroup *)(unsigned long)skcd->val;
-#endif
+ return skcd->cgroup;
}
#else /* CONFIG_CGROUP_DATA */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 881ce1470beb..8afa8690d288 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled) {
- skcd->no_refcnt = 1;
- return;
- }
-
/* Don't associate the sock with unrelated interrupted task's cgroup. */
if (in_interrupt())
return;
rcu_read_lock();
-
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
+ skcd->cgroup = cset->dfl_cgrp;
cgroup_bpf_get(cset->dfl_cgrp);
break;
}
cpu_relax();
}
-
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
- if (skcd->val) {
- if (skcd->no_refcnt)
- return;
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- }
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- if (skcd->no_refcnt)
- return;
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index b49c57d35a88..1a6a86693b74 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -71,11 +71,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
struct update_classid_context *ctx = (void *)v;
struct socket *sock = sock_from_file(file);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+ if (sock)
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
- spin_unlock(&cgroup_sk_update_lock);
- }
if (--ctx->batch == 0) {
ctx->batch = UPDATE_CLASSID_BATCH;
return n + 1;
@@ -121,8 +118,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
struct css_task_iter it;
struct task_struct *p;
- cgroup_sk_alloc_disable();
-
cs->classid = (u32)value;
css_task_iter_start(css, 0, &it);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 99a431c56f23..8456dfbe2eb4 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (!dev)
return -ENODEV;
- cgroup_sk_alloc_disable();
-
rtnl_lock();
ret = netprio_set_prio(of_css(of), dev, prio);
@@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
static int update_netprio(const void *v, struct file *file, unsigned n)
{
struct socket *sock = sock_from_file(file);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+
+ if (sock)
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
(unsigned long)v);
- spin_unlock(&cgroup_sk_update_lock);
- }
return 0;
}
@@ -235,8 +231,6 @@ static void net_prio_attach(struct cgroup_taskset *tset)
struct task_struct *p;
struct cgroup_subsys_state *css;
- cgroup_sk_alloc_disable();
-
cgroup_taskset_for_each(p, css, tset) {
void *v = (void *)(unsigned long)css->id;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel(a)iogearbox.net>
Date: Tue, 14 Sep 2021 01:07:57 +0200
Subject: [PATCH] bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode
Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
embedded per-socket cgroup information into sock->sk_cgrp_data and in order
to save 8 bytes in struct sock made both mutually exclusive, that is, when
cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2
falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).
The assumption made was "there is no reason to mix the two and this is in line
with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
However, with Kubernetes more widely supporting cgroups v2 as well nowadays,
this assumption no longer holds, and the possibility of the v1/v2 mixed mode
with the v2 root fallback being hit becomes a real security issue.
Many of the cgroup v2 BPF programs are also used for policy enforcement, just
to pick _one_ example, that is, to programmatically deny socket related system
calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
a policy bypass for the affected Pods.
In production environments, we have recently seen this case due to various
circumstances: i) a different 3rd party agent and/or ii) a container runtime
such as [0] in the user's environment configuring legacy cgroup v1 net_cls
tags, which triggered implicitly mentioned root fallback. Another case is
Kubernetes projects like kind [1] which create Kubernetes nodes in a container
and also add cgroup namespaces to the mix, meaning programs which are attached
to the cgroup v2 root of the cgroup namespace get attached to a non-root
cgroup v2 path from init namespace point of view. And the latter's root is
out of reach for agents on a kind Kubernetes node to configure. Meaning, any
entity on the node setting cgroup v1 net_cls tag will trigger the bypass
despite cgroup v2 BPF programs attached to the namespace root.
Generally, this mutual exclusiveness does not hold anymore in today's user
environments and makes cgroup v2 usage from BPF side fragile and unreliable.
This fix adds proper struct cgroup pointer for the cgroup v2 case to struct
sock_cgroup_data in order to address these issues; this implicitly also fixes
the tradeoffs being made back then with regards to races and refcount leaks
as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF
programs always operate as expected.
[0] https://github.com/nestybox/sysbox/
[1] https://kind.sigs.k8s.io/
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast(a)kernel.org>
Acked-by: Stanislav Fomichev <sdf(a)google.com>
Acked-by: Tejun Heo <tj(a)kernel.org>
Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e1c705fdfa7c..db2e147e069f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
* sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
* per-socket cgroup information except for memcg association.
*
- * On legacy hierarchies, net_prio and net_cls controllers directly set
- * attributes on each sock which can then be tested by the network layer.
- * On the default hierarchy, each sock is associated with the cgroup it was
- * created in and the networking layer can match the cgroup directly.
- *
- * To avoid carrying all three cgroup related fields separately in sock,
- * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
- * On boot, sock_cgroup_data records the cgroup that the sock was created
- * in so that cgroup2 matches can be made; however, once either net_prio or
- * net_cls starts being used, the area is overridden to carry prioidx and/or
- * classid. The two modes are distinguished by whether the lowest bit is
- * set. Clear bit indicates cgroup pointer while set bit prioidx and
- * classid.
- *
- * While userland may start using net_prio or net_cls at any time, once
- * either is used, cgroup2 matching no longer works. There is no reason to
- * mix the two and this is in line with how legacy and v2 compatibility is
- * handled. On mode switch, cgroup references which are already being
- * pointed to by socks may be leaked. While this can be remedied by adding
- * synchronization around sock_cgroup_data, given that the number of leaked
- * cgroups is bound and highly unlikely to be high, this seems to be the
- * better trade-off.
+ * On legacy hierarchies, net_prio and net_cls controllers directly
+ * set attributes on each sock which can then be tested by the network
+ * layer. On the default hierarchy, each sock is associated with the
+ * cgroup it was created in and the networking layer can match the
+ * cgroup directly.
*/
struct sock_cgroup_data {
- union {
-#ifdef __LITTLE_ENDIAN
- struct {
- u8 is_data : 1;
- u8 no_refcnt : 1;
- u8 unused : 6;
- u8 padding;
- u16 prioidx;
- u32 classid;
- } __packed;
-#else
- struct {
- u32 classid;
- u16 prioidx;
- u8 padding;
- u8 unused : 6;
- u8 no_refcnt : 1;
- u8 is_data : 1;
- } __packed;
+ struct cgroup *cgroup; /* v2 */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ u32 classid; /* v1 */
+#endif
+#ifdef CONFIG_CGROUP_NET_PRIO
+ u16 prioidx; /* v1 */
#endif
- u64 val;
- };
};
-/*
- * There's a theoretical window where the following accessors race with
- * updaters and return part of the previous pointer as the prioidx or
- * classid. Such races are short-lived and the result isn't critical.
- */
static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
{
- /* fallback to 1 which is always the ID of the root cgroup */
- return (skcd->is_data & 1) ? skcd->prioidx : 1;
+#ifdef CONFIG_CGROUP_NET_PRIO
+ return READ_ONCE(skcd->prioidx);
+#else
+ return 1;
+#endif
}
static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
{
- /* fallback to 0 which is the unconfigured default classid */
- return (skcd->is_data & 1) ? skcd->classid : 0;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ return READ_ONCE(skcd->classid);
+#else
+ return 0;
+#endif
}
-/*
- * If invoked concurrently, the updaters may clobber each other. The
- * caller is responsible for synchronization.
- */
static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
u16 prioidx)
{
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
- if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
- return;
-
- if (!(skcd_buf.is_data & 1)) {
- skcd_buf.val = 0;
- skcd_buf.is_data = 1;
- }
-
- skcd_buf.prioidx = prioidx;
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_PRIO
+ WRITE_ONCE(skcd->prioidx, prioidx);
+#endif
}
static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
u32 classid)
{
- struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
- if (sock_cgroup_classid(&skcd_buf) == classid)
- return;
-
- if (!(skcd_buf.is_data & 1)) {
- skcd_buf.val = 0;
- skcd_buf.is_data = 1;
- }
-
- skcd_buf.classid = classid;
- WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ WRITE_ONCE(skcd->classid, classid);
+#endif
}
#else /* CONFIG_SOCK_CGROUP_DATA */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7bf60454a313..75c151413fda 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -829,33 +829,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task,
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-extern spinlock_t cgroup_sk_update_lock;
-#endif
-
-void cgroup_sk_alloc_disable(void);
void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
- unsigned long v;
-
- /*
- * @skcd->val is 64bit but the following is safe on 32bit too as we
- * just need the lower ulong to be written and read atomically.
- */
- v = READ_ONCE(skcd->val);
-
- if (v & 3)
- return &cgrp_dfl_root.cgrp;
-
- return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
-#else
- return (struct cgroup *)(unsigned long)skcd->val;
-#endif
+ return skcd->cgroup;
}
#else /* CONFIG_CGROUP_DATA */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 881ce1470beb..8afa8690d288 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled) {
- skcd->no_refcnt = 1;
- return;
- }
-
/* Don't associate the sock with unrelated interrupted task's cgroup. */
if (in_interrupt())
return;
rcu_read_lock();
-
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
+ skcd->cgroup = cset->dfl_cgrp;
cgroup_bpf_get(cset->dfl_cgrp);
break;
}
cpu_relax();
}
-
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
- if (skcd->val) {
- if (skcd->no_refcnt)
- return;
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- }
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- if (skcd->no_refcnt)
- return;
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index b49c57d35a88..1a6a86693b74 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -71,11 +71,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
struct update_classid_context *ctx = (void *)v;
struct socket *sock = sock_from_file(file);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+ if (sock)
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
- spin_unlock(&cgroup_sk_update_lock);
- }
if (--ctx->batch == 0) {
ctx->batch = UPDATE_CLASSID_BATCH;
return n + 1;
@@ -121,8 +118,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
struct css_task_iter it;
struct task_struct *p;
- cgroup_sk_alloc_disable();
-
cs->classid = (u32)value;
css_task_iter_start(css, 0, &it);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 99a431c56f23..8456dfbe2eb4 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (!dev)
return -ENODEV;
- cgroup_sk_alloc_disable();
-
rtnl_lock();
ret = netprio_set_prio(of_css(of), dev, prio);
@@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
static int update_netprio(const void *v, struct file *file, unsigned n)
{
struct socket *sock = sock_from_file(file);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+
+ if (sock)
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
(unsigned long)v);
- spin_unlock(&cgroup_sk_update_lock);
- }
return 0;
}
@@ -235,8 +231,6 @@ static void net_prio_attach(struct cgroup_taskset *tset)
struct task_struct *p;
struct cgroup_subsys_state *css;
- cgroup_sk_alloc_disable();
-
cgroup_taskset_for_each(p, css, tset) {
void *v = (void *)(unsigned long)css->id;
On 9/20/21 10:17 AM, gregkh(a)linuxfoundation.org wrote:
>
> This is a note to let you know that I've just added the patch titled
>
> parisc: Declare pci_iounmap() parisc version only when CONFIG_PCI enabled
>
> to the 5.14-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> parisc-declare-pci_iounmap-parisc-version-only-when-config_pci-enabled.patch
> and it can be found in the queue-5.14 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
No, please don't add it to any stable tree yet.
This patch was applied by Linus in order to fix build errors on the alpha
architecture, but it triggers build errors on arm64 (and maybe other arches).
The whole issue is still being analyzed, as visible in e.g. this commit:
316e8d79a0959c302b0c462ab64b069599f10eef
Author: Linus Torvalds <torvalds(a)linux-foundation.org>
Date: Sun Sep 19 17:13:35 2021 -0700
pci_iounmap'2: Electric Boogaloo: try to make sense of it all
Helge
>
>
> From 9caea0007601d3bc6debec04f8b4cd6f4c2394be Mon Sep 17 00:00:00 2001
> From: Helge Deller <deller(a)gmx.de>
> Date: Sun, 19 Sep 2021 10:36:09 -0700
> Subject: parisc: Declare pci_iounmap() parisc version only when CONFIG_PCI enabled
>
> From: Helge Deller <deller(a)gmx.de>
>
> commit 9caea0007601d3bc6debec04f8b4cd6f4c2394be upstream.
>
> Linus noticed odd declaration rules for pci_iounmap() in iomap.h and
> pci_iomap.h, where it dependend on either NO_GENERIC_PCI_IOPORT_MAP or
> GENERIC_IOMAP when CONFIG_PCI was disabled.
>
> Testing on parisc seems to indicate that we need pci_iounmap() only when
> CONFIG_PCI is enabled, so the declaration of pci_iounmap() can be moved
> cleanly into pci_iomap.h in sync with the declarations of pci_iomap().
>
> Link: https://lore.kernel.org/all/CAHk-=wjRrh98pZoQ+AzfWmsTZacWxTJKXZ9eKU2X_0+jM=…
> Signed-off-by: Helge Deller <deller(a)gmx.de>
> Suggested-by: Linus Torvalds <torvalds(a)linux-foundation.org>
> Fixes: 97a29d59fc22 ("[PARISC] fix compile break caused by iomap: make IOPORT/PCI mapping functions conditional")
> Cc: Arnd Bergmann <arnd(a)arndb.de>
> Cc: Guenter Roeck <linux(a)roeck-us.net>
> Cc: Ulrich Teichert <krypton(a)ulrich-teichert.org>
> Cc: James Bottomley <James.Bottomley(a)hansenpartnership.com>
> Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
> Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
> ---
> arch/parisc/lib/iomap.c | 4 +++-
> include/asm-generic/iomap.h | 10 ----------
> include/asm-generic/pci_iomap.h | 3 +++
> 3 files changed, 6 insertions(+), 11 deletions(-)
>
> --- a/arch/parisc/lib/iomap.c
> +++ b/arch/parisc/lib/iomap.c
> @@ -513,12 +513,15 @@ void ioport_unmap(void __iomem *addr)
> }
> }
>
> +#ifdef CONFIG_PCI
> void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
> {
> if (!INDIRECT_ADDR(addr)) {
> iounmap(addr);
> }
> }
> +EXPORT_SYMBOL(pci_iounmap);
> +#endif
>
> EXPORT_SYMBOL(ioread8);
> EXPORT_SYMBOL(ioread16);
> @@ -544,4 +547,3 @@ EXPORT_SYMBOL(iowrite16_rep);
> EXPORT_SYMBOL(iowrite32_rep);
> EXPORT_SYMBOL(ioport_map);
> EXPORT_SYMBOL(ioport_unmap);
> -EXPORT_SYMBOL(pci_iounmap);
> --- a/include/asm-generic/iomap.h
> +++ b/include/asm-generic/iomap.h
> @@ -110,16 +110,6 @@ static inline void __iomem *ioremap_np(p
> }
> #endif
>
> -#ifdef CONFIG_PCI
> -/* Destroy a virtual mapping cookie for a PCI BAR (memory or IO) */
> -struct pci_dev;
> -extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
> -#elif defined(CONFIG_GENERIC_IOMAP)
> -struct pci_dev;
> -static inline void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
> -{ }
> -#endif
> -
> #include <asm-generic/pci_iomap.h>
>
> #endif
> --- a/include/asm-generic/pci_iomap.h
> +++ b/include/asm-generic/pci_iomap.h
> @@ -18,6 +18,7 @@ extern void __iomem *pci_iomap_range(str
> extern void __iomem *pci_iomap_wc_range(struct pci_dev *dev, int bar,
> unsigned long offset,
> unsigned long maxlen);
> +extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
> /* Create a virtual mapping cookie for a port on a given PCI device.
> * Do not call this directly, it exists to make it easier for architectures
> * to override */
> @@ -50,6 +51,8 @@ static inline void __iomem *pci_iomap_wc
> {
> return NULL;
> }
> +static inline void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
> +{ }
> #endif
>
> #endif /* __ASM_GENERIC_IO_H */
>
>
> Patches currently in stable-queue which might be from deller(a)gmx.de are
>
> queue-5.14/parisc-declare-pci_iounmap-parisc-version-only-when-config_pci-enabled.patch
>
On 9/20/21 10:25 AM, gregkh(a)linuxfoundation.org wrote:
>
> This is a note to let you know that I've just added the patch titled
>
> parisc: Declare pci_iounmap() parisc version only when CONFIG_PCI enabled
>
> to the 4.9-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> parisc-declare-pci_iounmap-parisc-version-only-when-config_pci-enabled.patch
> and it can be found in the queue-4.9 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
No, please don't add it to any stable tree yet.
This patch was applied by Linus in order to fix build errors on the alpha
architecture, but it triggers build errors on arm64 (and maybe other arches).
The whole issue is still being analyzed, as visible in e.g. this commit:
316e8d79a0959c302b0c462ab64b069599f10eef
Author: Linus Torvalds <torvalds(a)linux-foundation.org>
Date: Sun Sep 19 17:13:35 2021 -0700
pci_iounmap'2: Electric Boogaloo: try to make sense of it all
Helge
>
>
> From 9caea0007601d3bc6debec04f8b4cd6f4c2394be Mon Sep 17 00:00:00 2001
> From: Helge Deller <deller(a)gmx.de>
> Date: Sun, 19 Sep 2021 10:36:09 -0700
> Subject: parisc: Declare pci_iounmap() parisc version only when CONFIG_PCI enabled
>
> From: Helge Deller <deller(a)gmx.de>
>
> commit 9caea0007601d3bc6debec04f8b4cd6f4c2394be upstream.
>
> Linus noticed odd declaration rules for pci_iounmap() in iomap.h and
> pci_iomap.h, where it dependend on either NO_GENERIC_PCI_IOPORT_MAP or
> GENERIC_IOMAP when CONFIG_PCI was disabled.
>
> Testing on parisc seems to indicate that we need pci_iounmap() only when
> CONFIG_PCI is enabled, so the declaration of pci_iounmap() can be moved
> cleanly into pci_iomap.h in sync with the declarations of pci_iomap().
>
> Link: https://lore.kernel.org/all/CAHk-=wjRrh98pZoQ+AzfWmsTZacWxTJKXZ9eKU2X_0+jM=…
> Signed-off-by: Helge Deller <deller(a)gmx.de>
> Suggested-by: Linus Torvalds <torvalds(a)linux-foundation.org>
> Fixes: 97a29d59fc22 ("[PARISC] fix compile break caused by iomap: make IOPORT/PCI mapping functions conditional")
> Cc: Arnd Bergmann <arnd(a)arndb.de>
> Cc: Guenter Roeck <linux(a)roeck-us.net>
> Cc: Ulrich Teichert <krypton(a)ulrich-teichert.org>
> Cc: James Bottomley <James.Bottomley(a)hansenpartnership.com>
> Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
> Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
> ---
> arch/parisc/lib/iomap.c | 4 +++-
> include/asm-generic/iomap.h | 10 ----------
> include/asm-generic/pci_iomap.h | 3 +++
> 3 files changed, 6 insertions(+), 11 deletions(-)
>
> --- a/arch/parisc/lib/iomap.c
> +++ b/arch/parisc/lib/iomap.c
> @@ -436,12 +436,15 @@ void ioport_unmap(void __iomem *addr)
> }
> }
>
> +#ifdef CONFIG_PCI
> void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
> {
> if (!INDIRECT_ADDR(addr)) {
> iounmap(addr);
> }
> }
> +EXPORT_SYMBOL(pci_iounmap);
> +#endif
>
> EXPORT_SYMBOL(ioread8);
> EXPORT_SYMBOL(ioread16);
> @@ -461,4 +464,3 @@ EXPORT_SYMBOL(iowrite16_rep);
> EXPORT_SYMBOL(iowrite32_rep);
> EXPORT_SYMBOL(ioport_map);
> EXPORT_SYMBOL(ioport_unmap);
> -EXPORT_SYMBOL(pci_iounmap);
> --- a/include/asm-generic/iomap.h
> +++ b/include/asm-generic/iomap.h
> @@ -78,16 +78,6 @@ extern void ioport_unmap(void __iomem *)
> #define ioremap_wt ioremap_nocache
> #endif
>
> -#ifdef CONFIG_PCI
> -/* Destroy a virtual mapping cookie for a PCI BAR (memory or IO) */
> -struct pci_dev;
> -extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
> -#elif defined(CONFIG_GENERIC_IOMAP)
> -struct pci_dev;
> -static inline void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
> -{ }
> -#endif
> -
> #include <asm-generic/pci_iomap.h>
>
> #endif
> --- a/include/asm-generic/pci_iomap.h
> +++ b/include/asm-generic/pci_iomap.h
> @@ -22,6 +22,7 @@ extern void __iomem *pci_iomap_range(str
> extern void __iomem *pci_iomap_wc_range(struct pci_dev *dev, int bar,
> unsigned long offset,
> unsigned long maxlen);
> +extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
> /* Create a virtual mapping cookie for a port on a given PCI device.
> * Do not call this directly, it exists to make it easier for architectures
> * to override */
> @@ -54,6 +55,8 @@ static inline void __iomem *pci_iomap_wc
> {
> return NULL;
> }
> +static inline void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
> +{ }
> #endif
>
> #endif /* __ASM_GENERIC_IO_H */
>
>
> Patches currently in stable-queue which might be from deller(a)gmx.de are
>
> queue-4.9/parisc-declare-pci_iounmap-parisc-version-only-when-config_pci-enabled.patch
> queue-4.9/parisc-fix-crash-with-signals-and-alloca.patch
>
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b871895b148256f1721bc565d803860242755a0b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin(a)gmail.com>
Date: Fri, 3 Sep 2021 22:57:06 +1000
Subject: [PATCH] powerpc/64s: system call scv tabort fix for corrupt irq
soft-mask state
If a system call is made with a transaction active, the kernel
immediately aborts it and returns. scv system calls disable irqs even
earlier in their interrupt handler, and tabort_syscall does not fix this
up.
This can result in irq soft-mask state being messed up on the next
kernel entry, and crashing at BUG_ON(arch_irq_disabled_regs(regs)) in
the kernel exit handlers, or possibly worse.
This can't easily be fixed in asm because at this point an async irq may
have hit, which is soft-masked and marked pending. The pending interrupt
has to be replayed before returning to userspace. The fix is to move the
tabort_syscall code to C in the main syscall handler, and just skip the
system call but otherwise return as usual, which will take care of the
pending irqs. This also does a bunch of other things including possible
signal delivery to the process, but the doomed transaction should still
be aborted when it is eventually returned to.
The sc system call path is changed to use the new C function as well to
reduce code and path differences. This slows down how quickly system
calls are aborted when called while a transaction is active, which could
potentially impact TM performance. But making any system call is already
bad for performance, and TM is on the way out, so go with simpler over
faster.
Fixes: 7fa95f9adaee7 ("powerpc/64s: system call support for scv/rfscv instructions")
Reported-by: Eirik Fuller <efuller(a)redhat.com>
Signed-off-by: Nicholas Piggin <npiggin(a)gmail.com>
[mpe: Use #ifdef rather than IS_ENABLED() to fix build error on 32-bit]
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20210903125707.1601269-1-npiggin@gmail.com
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index a73f3f70a657..2ccc7ea5db00 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -18,6 +18,7 @@
#include <asm/switch_to.h>
#include <asm/syscall.h>
#include <asm/time.h>
+#include <asm/tm.h>
#include <asm/unistd.h>
#if defined(CONFIG_PPC_ADV_DEBUG_REGS) && defined(CONFIG_PPC32)
@@ -136,6 +137,35 @@ notrace long system_call_exception(long r3, long r4, long r5,
*/
irq_soft_mask_regs_set_state(regs, IRQS_ENABLED);
+ /*
+ * If the system call was made with a transaction active, doom it and
+ * return without performing the system call. Unless it was an
+ * unsupported scv vector, in which case it's treated like an illegal
+ * instruction.
+ */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ if (unlikely(MSR_TM_TRANSACTIONAL(regs->msr)) &&
+ !trap_is_unsupported_scv(regs)) {
+ /* Enable TM in the kernel, and disable EE (for scv) */
+ hard_irq_disable();
+ mtmsr(mfmsr() | MSR_TM);
+
+ /* tabort, this dooms the transaction, nothing else */
+ asm volatile(".long 0x7c00071d | ((%0) << 16)"
+ :: "r"(TM_CAUSE_SYSCALL|TM_CAUSE_PERSISTENT));
+
+ /*
+ * Userspace will never see the return value. Execution will
+ * resume after the tbegin. of the aborted transaction with the
+ * checkpointed register state. A context switch could occur
+ * or signal delivered to the process before resuming the
+ * doomed transaction context, but that should all be handled
+ * as expected.
+ */
+ return -ENOSYS;
+ }
+#endif // CONFIG_PPC_TRANSACTIONAL_MEM
+
local_irq_enable();
if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S
index d4212d2ff0b5..ec950b08a8dc 100644
--- a/arch/powerpc/kernel/interrupt_64.S
+++ b/arch/powerpc/kernel/interrupt_64.S
@@ -12,7 +12,6 @@
#include <asm/mmu.h>
#include <asm/ppc_asm.h>
#include <asm/ptrace.h>
-#include <asm/tm.h>
.section ".toc","aw"
SYS_CALL_TABLE:
@@ -55,12 +54,6 @@ COMPAT_SYS_CALL_TABLE:
.globl system_call_vectored_\name
system_call_vectored_\name:
_ASM_NOKPROBE_SYMBOL(system_call_vectored_\name)
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-BEGIN_FTR_SECTION
- extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */
- bne tabort_syscall
-END_FTR_SECTION_IFSET(CPU_FTR_TM)
-#endif
SCV_INTERRUPT_TO_KERNEL
mr r10,r1
ld r1,PACAKSAVE(r13)
@@ -247,12 +240,6 @@ _ASM_NOKPROBE_SYMBOL(system_call_common_real)
.globl system_call_common
system_call_common:
_ASM_NOKPROBE_SYMBOL(system_call_common)
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-BEGIN_FTR_SECTION
- extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */
- bne tabort_syscall
-END_FTR_SECTION_IFSET(CPU_FTR_TM)
-#endif
mr r10,r1
ld r1,PACAKSAVE(r13)
std r10,0(r1)
@@ -425,34 +412,6 @@ SOFT_MASK_TABLE(.Lsyscall_rst_start, 1b)
RESTART_TABLE(.Lsyscall_rst_start, .Lsyscall_rst_end, syscall_restart)
#endif
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-tabort_syscall:
-_ASM_NOKPROBE_SYMBOL(tabort_syscall)
- /* Firstly we need to enable TM in the kernel */
- mfmsr r10
- li r9, 1
- rldimi r10, r9, MSR_TM_LG, 63-MSR_TM_LG
- mtmsrd r10, 0
-
- /* tabort, this dooms the transaction, nothing else */
- li r9, (TM_CAUSE_SYSCALL|TM_CAUSE_PERSISTENT)
- TABORT(R9)
-
- /*
- * Return directly to userspace. We have corrupted user register state,
- * but userspace will never see that register state. Execution will
- * resume after the tbegin of the aborted transaction with the
- * checkpointed register state.
- */
- li r9, MSR_RI
- andc r10, r10, r9
- mtmsrd r10, 1
- mtspr SPRN_SRR0, r11
- mtspr SPRN_SRR1, r12
- RFI_TO_USER
- b . /* prevent speculative execution */
-#endif
-
/*
* If MSR EE/RI was never enabled, IRQs not reconciled, NVGPRs not
* touched, no exit work created, then this can be used.
The patch below does not apply to the 5.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 44df58d441a94de40d52fca67dc60790daee4266 Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang <xiaoguang.wang(a)linux.alibaba.com>
Date: Tue, 14 Sep 2021 22:38:52 +0800
Subject: [PATCH] io_uring: fix missing sigmask restore in io_cqring_wait()
Move get_timespec() section in io_cqring_wait() before the sigmask
saving, otherwise we'll fail to restore sigmask once get_timespec()
returns error.
Fixes: c73ebb685fb6 ("io_uring: add timeout support for io_uring_enter()")
Signed-off-by: Xiaoguang Wang <xiaoguang.wang(a)linux.alibaba.com>
Link: https://lore.kernel.org/r/20210914143852.9663-1-xiaoguang.wang@linux.alibab…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a864a94364c6..94afd087e97b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7518,6 +7518,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
} while (1);
+ if (uts) {
+ struct timespec64 ts;
+
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ timeout = timespec64_to_jiffies(&ts);
+ }
+
if (sig) {
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
@@ -7531,14 +7539,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
- if (uts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, uts))
- return -EFAULT;
- timeout = timespec64_to_jiffies(&ts);
- }
-
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 40ee363c844fcb6ae0f1f5cfea68aed7e268c2f4 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni(a)redhat.com>
Date: Wed, 15 Sep 2021 10:19:07 -0700
Subject: [PATCH] igc: fix tunnel offloading
Checking tunnel offloading, it turns out that offloading doesn't work
as expected. The following script allows to reproduce the issue.
Call it as `testscript DEVICE LOCALIP REMOTEIP NETMASK'
=== SNIP ===
if [ $# -ne 4 ]
then
echo "Usage $0 DEVICE LOCALIP REMOTEIP NETMASK"
exit 1
fi
DEVICE="$1"
LOCAL_ADDRESS="$2"
REMOTE_ADDRESS="$3"
NWMASK="$4"
echo "Driver: $(ethtool -i ${DEVICE} | awk '/^driver:/{print $2}') "
ethtool -k "${DEVICE}" | grep tx-udp
echo
echo "Set up NIC and tunnel..."
ip addr add "${LOCAL_ADDRESS}/${NWMASK}" dev "${DEVICE}"
ip link set "${DEVICE}" up
sleep 2
ip link add vxlan1 type vxlan id 42 \
remote "${REMOTE_ADDRESS}" \
local "${LOCAL_ADDRESS}" \
dstport 0 \
dev "${DEVICE}"
ip addr add fc00::1/64 dev vxlan1
ip link set vxlan1 up
sleep 2
rm -f vxlan.pcap
echo "Running tcpdump and iperf3..."
( nohup tcpdump -i any -w vxlan.pcap >/dev/null 2>&1 ) &
sleep 2
iperf3 -c fc00::2 >/dev/null
pkill tcpdump
echo
echo -n "Max. Paket Size: "
tcpdump -r vxlan.pcap -nnle 2>/dev/null \
| grep "${LOCAL_ADDRESS}.*> ${REMOTE_ADDRESS}.*OTV" \
| awk '{print $8}' | awk -F ':' '{print $1}' \
| sort -n | tail -1
echo
ip link del vxlan1
ip addr del ${LOCAL_ADDRESS}/${NWMASK} dev "${DEVICE}"
=== SNAP ===
The expected outcome is
Max. Paket Size: 64904
This is what you see on igb, the code igc has been taken from.
However, on igc the output is
Max. Paket Size: 1516
so the GSO aggregate packets are segmented by the kernel before calling
igc_xmit_frame. Inside the subsequent call to igc_tso, the check for
skb_is_gso(skb) fails and the function returns prematurely.
It turns out that this occurs because the feature flags aren't set
entirely correctly in igc_probe. In contrast to the original code
from igb_probe, igc_probe neglects to set the flags required to allow
tunnel offloading.
Setting the same flags as igb fixes the issue on igc.
Fixes: 34428dff3679 ("igc: Add GSO partial support")
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Tested-by: Corinna Vinschen <vinschen(a)redhat.com>
Acked-by: Sasha Neftin <sasha.neftin(a)intel.com>
Tested-by: Nechama Kraus <nechamax.kraus(a)linux.intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen(a)intel.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index b877efae61df..0e19b4d02e62 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -6350,7 +6350,9 @@ static int igc_probe(struct pci_dev *pdev,
if (pci_using_dac)
netdev->features |= NETIF_F_HIGHDMA;
- netdev->vlan_features |= netdev->features;
+ netdev->vlan_features |= netdev->features | NETIF_F_TSO_MANGLEID;
+ netdev->mpls_features |= NETIF_F_HW_CSUM;
+ netdev->hw_enc_features |= netdev->vlan_features;
/* MTU range: 68 - 9216 */
netdev->min_mtu = ETH_MIN_MTU;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 276aae377206d60b9b7b7df4586cd9f2a813f5d0 Mon Sep 17 00:00:00 2001
From: Joakim Zhang <qiangqing.zhang(a)nxp.com>
Date: Wed, 8 Sep 2021 15:43:35 +0800
Subject: [PATCH] net: stmmac: fix system hang caused by eee_ctrl_timer during
suspend/resume
commit 5f58591323bf ("net: stmmac: delete the eee_ctrl_timer after
napi disabled"), this patch tries to fix system hang caused by eee_ctrl_timer,
unfortunately, it only can resolve it for system reboot stress test. System
hang also can be reproduced easily during system suspend/resume stess test
when mount NFS on i.MX8MP EVK board.
In stmmac driver, eee feature is combined to phylink framework. When do
system suspend, phylink_stop() would queue delayed work, it invokes
stmmac_mac_link_down(), where to deactivate eee_ctrl_timer synchronizly.
In above commit, try to fix issue by deactivating eee_ctrl_timer obviously,
but it is not enough. Looking into eee_ctrl_timer expire callback
stmmac_eee_ctrl_timer(), it could enable hareware eee mode again. What is
unexpected is that LPI interrupt (MAC_Interrupt_Enable.LPIEN bit) is always
asserted. This interrupt has chance to be issued when LPI state entry/exit
from the MAC, and at that time, clock could have been already disabled.
The result is that system hang when driver try to touch register from
interrupt handler.
The reason why above commit can fix system hang issue in stmmac_release()
is that, deactivate eee_ctrl_timer not just after napi disabled, further
after irq freed.
In conclusion, hardware would generate LPI interrupt when clock has been
disabled during suspend or resume, since hardware is in eee mode and LPI
interrupt enabled.
Interrupts from MAC, MTL and DMA level are enabled and never been disabled
when system suspend, so postpone clocks management from suspend stage to
noirq suspend stage should be more safe.
Fixes: 5f58591323bf ("net: stmmac: delete the eee_ctrl_timer after napi disabled")
Signed-off-by: Joakim Zhang <qiangqing.zhang(a)nxp.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ece02b35a6ce..246f84fedbc8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -7118,7 +7118,6 @@ int stmmac_suspend(struct device *dev)
struct net_device *ndev = dev_get_drvdata(dev);
struct stmmac_priv *priv = netdev_priv(ndev);
u32 chan;
- int ret;
if (!ndev || !netif_running(ndev))
return 0;
@@ -7150,13 +7149,6 @@ int stmmac_suspend(struct device *dev)
} else {
stmmac_mac_set(priv, priv->ioaddr, false);
pinctrl_pm_select_sleep_state(priv->device);
- /* Disable clock in case of PWM is off */
- clk_disable_unprepare(priv->plat->clk_ptp_ref);
- ret = pm_runtime_force_suspend(dev);
- if (ret) {
- mutex_unlock(&priv->lock);
- return ret;
- }
}
mutex_unlock(&priv->lock);
@@ -7242,12 +7234,6 @@ int stmmac_resume(struct device *dev)
priv->irq_wake = 0;
} else {
pinctrl_pm_select_default_state(priv->device);
- /* enable the clk previously disabled */
- ret = pm_runtime_force_resume(dev);
- if (ret)
- return ret;
- if (priv->plat->clk_ptp_ref)
- clk_prepare_enable(priv->plat->clk_ptp_ref);
/* reset the phy so that it's ready */
if (priv->mii)
stmmac_mdio_reset(priv->mii);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 5ca710844cc1..4885f9ad1b1e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -9,6 +9,7 @@
*******************************************************************************/
#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/of.h>
@@ -771,9 +772,52 @@ static int __maybe_unused stmmac_runtime_resume(struct device *dev)
return stmmac_bus_clks_config(priv, true);
}
+static int stmmac_pltfr_noirq_suspend(struct device *dev)
+{
+ struct net_device *ndev = dev_get_drvdata(dev);
+ struct stmmac_priv *priv = netdev_priv(ndev);
+ int ret;
+
+ if (!netif_running(ndev))
+ return 0;
+
+ if (!device_may_wakeup(priv->device) || !priv->plat->pmt) {
+ /* Disable clock in case of PWM is off */
+ clk_disable_unprepare(priv->plat->clk_ptp_ref);
+
+ ret = pm_runtime_force_suspend(dev);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int stmmac_pltfr_noirq_resume(struct device *dev)
+{
+ struct net_device *ndev = dev_get_drvdata(dev);
+ struct stmmac_priv *priv = netdev_priv(ndev);
+ int ret;
+
+ if (!netif_running(ndev))
+ return 0;
+
+ if (!device_may_wakeup(priv->device) || !priv->plat->pmt) {
+ /* enable the clk previously disabled */
+ ret = pm_runtime_force_resume(dev);
+ if (ret)
+ return ret;
+
+ clk_prepare_enable(priv->plat->clk_ptp_ref);
+ }
+
+ return 0;
+}
+
const struct dev_pm_ops stmmac_pltfr_pm_ops = {
SET_SYSTEM_SLEEP_PM_OPS(stmmac_pltfr_suspend, stmmac_pltfr_resume)
SET_RUNTIME_PM_OPS(stmmac_runtime_suspend, stmmac_runtime_resume, NULL)
+ SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(stmmac_pltfr_noirq_suspend, stmmac_pltfr_noirq_resume)
};
EXPORT_SYMBOL_GPL(stmmac_pltfr_pm_ops);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 276aae377206d60b9b7b7df4586cd9f2a813f5d0 Mon Sep 17 00:00:00 2001
From: Joakim Zhang <qiangqing.zhang(a)nxp.com>
Date: Wed, 8 Sep 2021 15:43:35 +0800
Subject: [PATCH] net: stmmac: fix system hang caused by eee_ctrl_timer during
suspend/resume
commit 5f58591323bf ("net: stmmac: delete the eee_ctrl_timer after
napi disabled"), this patch tries to fix system hang caused by eee_ctrl_timer,
unfortunately, it only can resolve it for system reboot stress test. System
hang also can be reproduced easily during system suspend/resume stess test
when mount NFS on i.MX8MP EVK board.
In stmmac driver, eee feature is combined to phylink framework. When do
system suspend, phylink_stop() would queue delayed work, it invokes
stmmac_mac_link_down(), where to deactivate eee_ctrl_timer synchronizly.
In above commit, try to fix issue by deactivating eee_ctrl_timer obviously,
but it is not enough. Looking into eee_ctrl_timer expire callback
stmmac_eee_ctrl_timer(), it could enable hareware eee mode again. What is
unexpected is that LPI interrupt (MAC_Interrupt_Enable.LPIEN bit) is always
asserted. This interrupt has chance to be issued when LPI state entry/exit
from the MAC, and at that time, clock could have been already disabled.
The result is that system hang when driver try to touch register from
interrupt handler.
The reason why above commit can fix system hang issue in stmmac_release()
is that, deactivate eee_ctrl_timer not just after napi disabled, further
after irq freed.
In conclusion, hardware would generate LPI interrupt when clock has been
disabled during suspend or resume, since hardware is in eee mode and LPI
interrupt enabled.
Interrupts from MAC, MTL and DMA level are enabled and never been disabled
when system suspend, so postpone clocks management from suspend stage to
noirq suspend stage should be more safe.
Fixes: 5f58591323bf ("net: stmmac: delete the eee_ctrl_timer after napi disabled")
Signed-off-by: Joakim Zhang <qiangqing.zhang(a)nxp.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ece02b35a6ce..246f84fedbc8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -7118,7 +7118,6 @@ int stmmac_suspend(struct device *dev)
struct net_device *ndev = dev_get_drvdata(dev);
struct stmmac_priv *priv = netdev_priv(ndev);
u32 chan;
- int ret;
if (!ndev || !netif_running(ndev))
return 0;
@@ -7150,13 +7149,6 @@ int stmmac_suspend(struct device *dev)
} else {
stmmac_mac_set(priv, priv->ioaddr, false);
pinctrl_pm_select_sleep_state(priv->device);
- /* Disable clock in case of PWM is off */
- clk_disable_unprepare(priv->plat->clk_ptp_ref);
- ret = pm_runtime_force_suspend(dev);
- if (ret) {
- mutex_unlock(&priv->lock);
- return ret;
- }
}
mutex_unlock(&priv->lock);
@@ -7242,12 +7234,6 @@ int stmmac_resume(struct device *dev)
priv->irq_wake = 0;
} else {
pinctrl_pm_select_default_state(priv->device);
- /* enable the clk previously disabled */
- ret = pm_runtime_force_resume(dev);
- if (ret)
- return ret;
- if (priv->plat->clk_ptp_ref)
- clk_prepare_enable(priv->plat->clk_ptp_ref);
/* reset the phy so that it's ready */
if (priv->mii)
stmmac_mdio_reset(priv->mii);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 5ca710844cc1..4885f9ad1b1e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -9,6 +9,7 @@
*******************************************************************************/
#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/of.h>
@@ -771,9 +772,52 @@ static int __maybe_unused stmmac_runtime_resume(struct device *dev)
return stmmac_bus_clks_config(priv, true);
}
+static int stmmac_pltfr_noirq_suspend(struct device *dev)
+{
+ struct net_device *ndev = dev_get_drvdata(dev);
+ struct stmmac_priv *priv = netdev_priv(ndev);
+ int ret;
+
+ if (!netif_running(ndev))
+ return 0;
+
+ if (!device_may_wakeup(priv->device) || !priv->plat->pmt) {
+ /* Disable clock in case of PWM is off */
+ clk_disable_unprepare(priv->plat->clk_ptp_ref);
+
+ ret = pm_runtime_force_suspend(dev);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int stmmac_pltfr_noirq_resume(struct device *dev)
+{
+ struct net_device *ndev = dev_get_drvdata(dev);
+ struct stmmac_priv *priv = netdev_priv(ndev);
+ int ret;
+
+ if (!netif_running(ndev))
+ return 0;
+
+ if (!device_may_wakeup(priv->device) || !priv->plat->pmt) {
+ /* enable the clk previously disabled */
+ ret = pm_runtime_force_resume(dev);
+ if (ret)
+ return ret;
+
+ clk_prepare_enable(priv->plat->clk_ptp_ref);
+ }
+
+ return 0;
+}
+
const struct dev_pm_ops stmmac_pltfr_pm_ops = {
SET_SYSTEM_SLEEP_PM_OPS(stmmac_pltfr_suspend, stmmac_pltfr_resume)
SET_RUNTIME_PM_OPS(stmmac_runtime_suspend, stmmac_runtime_resume, NULL)
+ SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(stmmac_pltfr_noirq_suspend, stmmac_pltfr_noirq_resume)
};
EXPORT_SYMBOL_GPL(stmmac_pltfr_pm_ops);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 276aae377206d60b9b7b7df4586cd9f2a813f5d0 Mon Sep 17 00:00:00 2001
From: Joakim Zhang <qiangqing.zhang(a)nxp.com>
Date: Wed, 8 Sep 2021 15:43:35 +0800
Subject: [PATCH] net: stmmac: fix system hang caused by eee_ctrl_timer during
suspend/resume
commit 5f58591323bf ("net: stmmac: delete the eee_ctrl_timer after
napi disabled"), this patch tries to fix system hang caused by eee_ctrl_timer,
unfortunately, it only can resolve it for system reboot stress test. System
hang also can be reproduced easily during system suspend/resume stess test
when mount NFS on i.MX8MP EVK board.
In stmmac driver, eee feature is combined to phylink framework. When do
system suspend, phylink_stop() would queue delayed work, it invokes
stmmac_mac_link_down(), where to deactivate eee_ctrl_timer synchronizly.
In above commit, try to fix issue by deactivating eee_ctrl_timer obviously,
but it is not enough. Looking into eee_ctrl_timer expire callback
stmmac_eee_ctrl_timer(), it could enable hareware eee mode again. What is
unexpected is that LPI interrupt (MAC_Interrupt_Enable.LPIEN bit) is always
asserted. This interrupt has chance to be issued when LPI state entry/exit
from the MAC, and at that time, clock could have been already disabled.
The result is that system hang when driver try to touch register from
interrupt handler.
The reason why above commit can fix system hang issue in stmmac_release()
is that, deactivate eee_ctrl_timer not just after napi disabled, further
after irq freed.
In conclusion, hardware would generate LPI interrupt when clock has been
disabled during suspend or resume, since hardware is in eee mode and LPI
interrupt enabled.
Interrupts from MAC, MTL and DMA level are enabled and never been disabled
when system suspend, so postpone clocks management from suspend stage to
noirq suspend stage should be more safe.
Fixes: 5f58591323bf ("net: stmmac: delete the eee_ctrl_timer after napi disabled")
Signed-off-by: Joakim Zhang <qiangqing.zhang(a)nxp.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ece02b35a6ce..246f84fedbc8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -7118,7 +7118,6 @@ int stmmac_suspend(struct device *dev)
struct net_device *ndev = dev_get_drvdata(dev);
struct stmmac_priv *priv = netdev_priv(ndev);
u32 chan;
- int ret;
if (!ndev || !netif_running(ndev))
return 0;
@@ -7150,13 +7149,6 @@ int stmmac_suspend(struct device *dev)
} else {
stmmac_mac_set(priv, priv->ioaddr, false);
pinctrl_pm_select_sleep_state(priv->device);
- /* Disable clock in case of PWM is off */
- clk_disable_unprepare(priv->plat->clk_ptp_ref);
- ret = pm_runtime_force_suspend(dev);
- if (ret) {
- mutex_unlock(&priv->lock);
- return ret;
- }
}
mutex_unlock(&priv->lock);
@@ -7242,12 +7234,6 @@ int stmmac_resume(struct device *dev)
priv->irq_wake = 0;
} else {
pinctrl_pm_select_default_state(priv->device);
- /* enable the clk previously disabled */
- ret = pm_runtime_force_resume(dev);
- if (ret)
- return ret;
- if (priv->plat->clk_ptp_ref)
- clk_prepare_enable(priv->plat->clk_ptp_ref);
/* reset the phy so that it's ready */
if (priv->mii)
stmmac_mdio_reset(priv->mii);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 5ca710844cc1..4885f9ad1b1e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -9,6 +9,7 @@
*******************************************************************************/
#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/of.h>
@@ -771,9 +772,52 @@ static int __maybe_unused stmmac_runtime_resume(struct device *dev)
return stmmac_bus_clks_config(priv, true);
}
+static int stmmac_pltfr_noirq_suspend(struct device *dev)
+{
+ struct net_device *ndev = dev_get_drvdata(dev);
+ struct stmmac_priv *priv = netdev_priv(ndev);
+ int ret;
+
+ if (!netif_running(ndev))
+ return 0;
+
+ if (!device_may_wakeup(priv->device) || !priv->plat->pmt) {
+ /* Disable clock in case of PWM is off */
+ clk_disable_unprepare(priv->plat->clk_ptp_ref);
+
+ ret = pm_runtime_force_suspend(dev);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int stmmac_pltfr_noirq_resume(struct device *dev)
+{
+ struct net_device *ndev = dev_get_drvdata(dev);
+ struct stmmac_priv *priv = netdev_priv(ndev);
+ int ret;
+
+ if (!netif_running(ndev))
+ return 0;
+
+ if (!device_may_wakeup(priv->device) || !priv->plat->pmt) {
+ /* enable the clk previously disabled */
+ ret = pm_runtime_force_resume(dev);
+ if (ret)
+ return ret;
+
+ clk_prepare_enable(priv->plat->clk_ptp_ref);
+ }
+
+ return 0;
+}
+
const struct dev_pm_ops stmmac_pltfr_pm_ops = {
SET_SYSTEM_SLEEP_PM_OPS(stmmac_pltfr_suspend, stmmac_pltfr_resume)
SET_RUNTIME_PM_OPS(stmmac_runtime_suspend, stmmac_runtime_resume, NULL)
+ SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(stmmac_pltfr_noirq_suspend, stmmac_pltfr_noirq_resume)
};
EXPORT_SYMBOL_GPL(stmmac_pltfr_pm_ops);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 276aae377206d60b9b7b7df4586cd9f2a813f5d0 Mon Sep 17 00:00:00 2001
From: Joakim Zhang <qiangqing.zhang(a)nxp.com>
Date: Wed, 8 Sep 2021 15:43:35 +0800
Subject: [PATCH] net: stmmac: fix system hang caused by eee_ctrl_timer during
suspend/resume
commit 5f58591323bf ("net: stmmac: delete the eee_ctrl_timer after
napi disabled"), this patch tries to fix system hang caused by eee_ctrl_timer,
unfortunately, it only can resolve it for system reboot stress test. System
hang also can be reproduced easily during system suspend/resume stess test
when mount NFS on i.MX8MP EVK board.
In stmmac driver, eee feature is combined to phylink framework. When do
system suspend, phylink_stop() would queue delayed work, it invokes
stmmac_mac_link_down(), where to deactivate eee_ctrl_timer synchronizly.
In above commit, try to fix issue by deactivating eee_ctrl_timer obviously,
but it is not enough. Looking into eee_ctrl_timer expire callback
stmmac_eee_ctrl_timer(), it could enable hareware eee mode again. What is
unexpected is that LPI interrupt (MAC_Interrupt_Enable.LPIEN bit) is always
asserted. This interrupt has chance to be issued when LPI state entry/exit
from the MAC, and at that time, clock could have been already disabled.
The result is that system hang when driver try to touch register from
interrupt handler.
The reason why above commit can fix system hang issue in stmmac_release()
is that, deactivate eee_ctrl_timer not just after napi disabled, further
after irq freed.
In conclusion, hardware would generate LPI interrupt when clock has been
disabled during suspend or resume, since hardware is in eee mode and LPI
interrupt enabled.
Interrupts from MAC, MTL and DMA level are enabled and never been disabled
when system suspend, so postpone clocks management from suspend stage to
noirq suspend stage should be more safe.
Fixes: 5f58591323bf ("net: stmmac: delete the eee_ctrl_timer after napi disabled")
Signed-off-by: Joakim Zhang <qiangqing.zhang(a)nxp.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ece02b35a6ce..246f84fedbc8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -7118,7 +7118,6 @@ int stmmac_suspend(struct device *dev)
struct net_device *ndev = dev_get_drvdata(dev);
struct stmmac_priv *priv = netdev_priv(ndev);
u32 chan;
- int ret;
if (!ndev || !netif_running(ndev))
return 0;
@@ -7150,13 +7149,6 @@ int stmmac_suspend(struct device *dev)
} else {
stmmac_mac_set(priv, priv->ioaddr, false);
pinctrl_pm_select_sleep_state(priv->device);
- /* Disable clock in case of PWM is off */
- clk_disable_unprepare(priv->plat->clk_ptp_ref);
- ret = pm_runtime_force_suspend(dev);
- if (ret) {
- mutex_unlock(&priv->lock);
- return ret;
- }
}
mutex_unlock(&priv->lock);
@@ -7242,12 +7234,6 @@ int stmmac_resume(struct device *dev)
priv->irq_wake = 0;
} else {
pinctrl_pm_select_default_state(priv->device);
- /* enable the clk previously disabled */
- ret = pm_runtime_force_resume(dev);
- if (ret)
- return ret;
- if (priv->plat->clk_ptp_ref)
- clk_prepare_enable(priv->plat->clk_ptp_ref);
/* reset the phy so that it's ready */
if (priv->mii)
stmmac_mdio_reset(priv->mii);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 5ca710844cc1..4885f9ad1b1e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -9,6 +9,7 @@
*******************************************************************************/
#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/of.h>
@@ -771,9 +772,52 @@ static int __maybe_unused stmmac_runtime_resume(struct device *dev)
return stmmac_bus_clks_config(priv, true);
}
+static int stmmac_pltfr_noirq_suspend(struct device *dev)
+{
+ struct net_device *ndev = dev_get_drvdata(dev);
+ struct stmmac_priv *priv = netdev_priv(ndev);
+ int ret;
+
+ if (!netif_running(ndev))
+ return 0;
+
+ if (!device_may_wakeup(priv->device) || !priv->plat->pmt) {
+ /* Disable clock in case of PWM is off */
+ clk_disable_unprepare(priv->plat->clk_ptp_ref);
+
+ ret = pm_runtime_force_suspend(dev);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int stmmac_pltfr_noirq_resume(struct device *dev)
+{
+ struct net_device *ndev = dev_get_drvdata(dev);
+ struct stmmac_priv *priv = netdev_priv(ndev);
+ int ret;
+
+ if (!netif_running(ndev))
+ return 0;
+
+ if (!device_may_wakeup(priv->device) || !priv->plat->pmt) {
+ /* enable the clk previously disabled */
+ ret = pm_runtime_force_resume(dev);
+ if (ret)
+ return ret;
+
+ clk_prepare_enable(priv->plat->clk_ptp_ref);
+ }
+
+ return 0;
+}
+
const struct dev_pm_ops stmmac_pltfr_pm_ops = {
SET_SYSTEM_SLEEP_PM_OPS(stmmac_pltfr_suspend, stmmac_pltfr_resume)
SET_RUNTIME_PM_OPS(stmmac_runtime_suspend, stmmac_runtime_resume, NULL)
+ SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(stmmac_pltfr_noirq_suspend, stmmac_pltfr_noirq_resume)
};
EXPORT_SYMBOL_GPL(stmmac_pltfr_pm_ops);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 276aae377206d60b9b7b7df4586cd9f2a813f5d0 Mon Sep 17 00:00:00 2001
From: Joakim Zhang <qiangqing.zhang(a)nxp.com>
Date: Wed, 8 Sep 2021 15:43:35 +0800
Subject: [PATCH] net: stmmac: fix system hang caused by eee_ctrl_timer during
suspend/resume
commit 5f58591323bf ("net: stmmac: delete the eee_ctrl_timer after
napi disabled"), this patch tries to fix system hang caused by eee_ctrl_timer,
unfortunately, it only can resolve it for system reboot stress test. System
hang also can be reproduced easily during system suspend/resume stess test
when mount NFS on i.MX8MP EVK board.
In stmmac driver, eee feature is combined to phylink framework. When do
system suspend, phylink_stop() would queue delayed work, it invokes
stmmac_mac_link_down(), where to deactivate eee_ctrl_timer synchronizly.
In above commit, try to fix issue by deactivating eee_ctrl_timer obviously,
but it is not enough. Looking into eee_ctrl_timer expire callback
stmmac_eee_ctrl_timer(), it could enable hareware eee mode again. What is
unexpected is that LPI interrupt (MAC_Interrupt_Enable.LPIEN bit) is always
asserted. This interrupt has chance to be issued when LPI state entry/exit
from the MAC, and at that time, clock could have been already disabled.
The result is that system hang when driver try to touch register from
interrupt handler.
The reason why above commit can fix system hang issue in stmmac_release()
is that, deactivate eee_ctrl_timer not just after napi disabled, further
after irq freed.
In conclusion, hardware would generate LPI interrupt when clock has been
disabled during suspend or resume, since hardware is in eee mode and LPI
interrupt enabled.
Interrupts from MAC, MTL and DMA level are enabled and never been disabled
when system suspend, so postpone clocks management from suspend stage to
noirq suspend stage should be more safe.
Fixes: 5f58591323bf ("net: stmmac: delete the eee_ctrl_timer after napi disabled")
Signed-off-by: Joakim Zhang <qiangqing.zhang(a)nxp.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ece02b35a6ce..246f84fedbc8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -7118,7 +7118,6 @@ int stmmac_suspend(struct device *dev)
struct net_device *ndev = dev_get_drvdata(dev);
struct stmmac_priv *priv = netdev_priv(ndev);
u32 chan;
- int ret;
if (!ndev || !netif_running(ndev))
return 0;
@@ -7150,13 +7149,6 @@ int stmmac_suspend(struct device *dev)
} else {
stmmac_mac_set(priv, priv->ioaddr, false);
pinctrl_pm_select_sleep_state(priv->device);
- /* Disable clock in case of PWM is off */
- clk_disable_unprepare(priv->plat->clk_ptp_ref);
- ret = pm_runtime_force_suspend(dev);
- if (ret) {
- mutex_unlock(&priv->lock);
- return ret;
- }
}
mutex_unlock(&priv->lock);
@@ -7242,12 +7234,6 @@ int stmmac_resume(struct device *dev)
priv->irq_wake = 0;
} else {
pinctrl_pm_select_default_state(priv->device);
- /* enable the clk previously disabled */
- ret = pm_runtime_force_resume(dev);
- if (ret)
- return ret;
- if (priv->plat->clk_ptp_ref)
- clk_prepare_enable(priv->plat->clk_ptp_ref);
/* reset the phy so that it's ready */
if (priv->mii)
stmmac_mdio_reset(priv->mii);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 5ca710844cc1..4885f9ad1b1e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -9,6 +9,7 @@
*******************************************************************************/
#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/of.h>
@@ -771,9 +772,52 @@ static int __maybe_unused stmmac_runtime_resume(struct device *dev)
return stmmac_bus_clks_config(priv, true);
}
+static int stmmac_pltfr_noirq_suspend(struct device *dev)
+{
+ struct net_device *ndev = dev_get_drvdata(dev);
+ struct stmmac_priv *priv = netdev_priv(ndev);
+ int ret;
+
+ if (!netif_running(ndev))
+ return 0;
+
+ if (!device_may_wakeup(priv->device) || !priv->plat->pmt) {
+ /* Disable clock in case of PWM is off */
+ clk_disable_unprepare(priv->plat->clk_ptp_ref);
+
+ ret = pm_runtime_force_suspend(dev);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int stmmac_pltfr_noirq_resume(struct device *dev)
+{
+ struct net_device *ndev = dev_get_drvdata(dev);
+ struct stmmac_priv *priv = netdev_priv(ndev);
+ int ret;
+
+ if (!netif_running(ndev))
+ return 0;
+
+ if (!device_may_wakeup(priv->device) || !priv->plat->pmt) {
+ /* enable the clk previously disabled */
+ ret = pm_runtime_force_resume(dev);
+ if (ret)
+ return ret;
+
+ clk_prepare_enable(priv->plat->clk_ptp_ref);
+ }
+
+ return 0;
+}
+
const struct dev_pm_ops stmmac_pltfr_pm_ops = {
SET_SYSTEM_SLEEP_PM_OPS(stmmac_pltfr_suspend, stmmac_pltfr_resume)
SET_RUNTIME_PM_OPS(stmmac_runtime_suspend, stmmac_runtime_resume, NULL)
+ SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(stmmac_pltfr_noirq_suspend, stmmac_pltfr_noirq_resume)
};
EXPORT_SYMBOL_GPL(stmmac_pltfr_pm_ops);