The bspec is confusing on the nature of the upper 32bits of the LRC
descriptor. Once upon a time, it said that it uses the upper 32b to
decide if it should perform a lite-restore, and so we must ensure that
each unique context submitted to HW is given a unique CCID [for the
duration of it being on the HW]. Currently, this was thought to be
achieved by using a small circular tag, and assigning every context
submitted to HW a new id. However, due to preemption replacing an
inflight context, it would be possible to assign an id to a new context
that matched the currently in flight context -- a situation we sought to
avoid.
Rather than use a simple circular counter, switch over to a small bitmap
of inflight ids so we can avoid reusing one that is still potentially
active.
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1796
Fixes: 2935ed5339c4 ("drm/i915: Remove logical HW ID")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # v5.5+
---
drivers/gpu/drm/i915/gt/intel_engine_types.h | 3 +-
drivers/gpu/drm/i915/gt/intel_lrc.c | 31 ++++++++++++++------
drivers/gpu/drm/i915/i915_perf.c | 3 +-
drivers/gpu/drm/i915/selftests/i915_vma.c | 2 +-
4 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index bf395227c99f..a9fc3fbbe482 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -304,8 +304,7 @@ struct intel_engine_cs {
u32 context_size;
u32 mmio_base;
- unsigned int context_tag;
-#define NUM_CONTEXT_TAG roundup_pow_of_two(2 * EXECLIST_MAX_PORTS)
+ unsigned long context_tag;
struct rb_node uabi_node;
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 93a1b73ad96b..e84d277d1f02 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1404,13 +1404,17 @@ __execlists_schedule_in(struct i915_request *rq)
ce->lrc_desc &= ~GENMASK_ULL(47, 37);
if (ce->tag) {
/* Use a fixed tag for OA and friends */
+ GEM_BUG_ON(ce->tag <= BITS_PER_TYPE(engine->context_tag));
ce->lrc_desc |= (u64)ce->tag << 32;
} else {
/* We don't need a strict matching tag, just different values */
- ce->lrc_desc |=
- (u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
- GEN11_SW_CTX_ID_SHIFT;
- BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
+ unsigned int tag = ffs(engine->context_tag);
+
+ GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
+ clear_bit(tag - 1, &engine->context_tag);
+ ce->lrc_desc |= (u64)tag << GEN11_SW_CTX_ID_SHIFT;
+
+ BUILD_BUG_ON(BITS_PER_TYPE(engine->context_tag) > GEN12_MAX_CONTEXT_HW_ID);
}
__intel_gt_pm_get(engine->gt);
@@ -1452,7 +1456,8 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
static inline void
__execlists_schedule_out(struct i915_request *rq,
- struct intel_engine_cs * const engine)
+ struct intel_engine_cs * const engine,
+ unsigned int ccid)
{
struct intel_context * const ce = rq->context;
@@ -1470,6 +1475,10 @@ __execlists_schedule_out(struct i915_request *rq,
i915_request_completed(rq))
intel_engine_add_retire(engine, ce->timeline);
+ ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
+ if (ccid < BITS_PER_TYPE(engine->context_tag))
+ set_bit(ccid - 1, &engine->context_tag);
+
intel_context_update_runtime(ce);
intel_engine_context_out(engine);
execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
@@ -1495,15 +1504,17 @@ execlists_schedule_out(struct i915_request *rq)
{
struct intel_context * const ce = rq->context;
struct intel_engine_cs *cur, *old;
+ unsigned int ccid;
trace_i915_request_out(rq);
+ ccid = upper_32_bits(rq->context->lrc_desc);
old = READ_ONCE(ce->inflight);
do
cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
while (!try_cmpxchg(&ce->inflight, &old, cur));
if (!cur)
- __execlists_schedule_out(rq, old);
+ __execlists_schedule_out(rq, old, ccid);
i915_request_put(rq);
}
@@ -1571,8 +1582,9 @@ dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
if (!rq)
return "";
- snprintf(buf, buflen, "%s%llx:%lld%s prio %d",
+ snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
prefix,
+ upper_32_bits(rq->context->lrc_desc),
rq->fence.context, rq->fence.seqno,
i915_request_completed(rq) ? "!" :
i915_request_started(rq) ? "*" :
@@ -3444,7 +3456,8 @@ __execlists_context_pin(struct intel_context *ce,
if (IS_ERR(vaddr))
return PTR_ERR(vaddr);
- ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
+ ce->lrc_desc &= ~GENMASK_ULL(31, 0);
+ ce->lrc_desc |= lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
__execlists_update_reg_state(ce, engine, ce->ring->tail);
@@ -4002,7 +4015,7 @@ static void enable_execlists(struct intel_engine_cs *engine)
enable_error_interrupt(engine);
- engine->context_tag = 0;
+ engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
}
static bool unexpected_starting_state(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index dec1b33e4da8..1863a5c4778d 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1281,11 +1281,10 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32);
/*
* Pick an unused context id
- * 0 - (NUM_CONTEXT_TAG - 1) are used by other contexts
+ * 0 - BITS_PER_LONG are used by other contexts
* GEN12_MAX_CONTEXT_HW_ID (0x7ff) is used by idle context
*/
stream->specific_ctx_id = (GEN12_MAX_CONTEXT_HW_ID - 1) << (GEN11_SW_CTX_ID_SHIFT - 32);
- BUILD_BUG_ON((GEN12_MAX_CONTEXT_HW_ID - 1) < NUM_CONTEXT_TAG);
break;
}
diff --git a/drivers/gpu/drm/i915/selftests/i915_vma.c b/drivers/gpu/drm/i915/selftests/i915_vma.c
index 58b5f40a07dd..af89c7fc8f59 100644
--- a/drivers/gpu/drm/i915/selftests/i915_vma.c
+++ b/drivers/gpu/drm/i915/selftests/i915_vma.c
@@ -173,7 +173,7 @@ static int igt_vma_create(void *arg)
}
nc = 0;
- for_each_prime_number(num_ctx, 2 * NUM_CONTEXT_TAG) {
+ for_each_prime_number(num_ctx, 2 * BITS_PER_LONG) {
for (; nc < num_ctx; nc++) {
ctx = mock_context(i915, "mock");
if (!ctx)
--
2.20.1
Upstream commit e1eb075c5051987fbbadbc0fb8211679df657721.
If we use a non-forcewaked write to PMINTRMSK, it does not take effect
until much later, if at all, causing a loss of RPS interrupts and no GPU
reclocking, leaving the GPU running at the wrong frequency for long
periods of time.
Reported-by: Francisco Jerez <currojerez(a)riseup.net>
Suggested-by: Francisco Jerez <currojerez(a)riseup.net>
Fixes: 35cc7f32c298 ("drm/i915/gt: Use non-forcewake writes for RPS")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Francisco Jerez <currojerez(a)riseup.net>
Cc: Mika Kuoppala <mika.kuoppala(a)linux.intel.com>
Cc: Andi Shyti <andi.shyti(a)intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala(a)linux.intel.com>
Reviewed-by: Andi Shyti <andi.shyti(a)intel.com>
Reviewed-by: Francisco Jerez <currojerez(a)riseup.net>
Cc: <stable(a)vger.kernel.org> # v5.6+
Link: https://patchwork.freedesktop.org/patch/msgid/20200415170318.16771-2-chris@…
(cherry picked from commit a080bd994c4023042a2b605c65fa10a25933f636)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi(a)intel.com>
(cherry picked from commit e1eb075c5051987fbbadbc0fb8211679df657721)
---
drivers/gpu/drm/i915/gt/intel_rps.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
index b2d245963d9f..8accea06185b 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -83,7 +83,8 @@ static void rps_enable_interrupts(struct intel_rps *rps)
gen6_gt_pm_enable_irq(gt, rps->pm_events);
spin_unlock_irq(>->irq_lock);
- set(gt->uncore, GEN6_PMINTRMSK, rps_pm_mask(rps, rps->cur_freq));
+ intel_uncore_write(gt->uncore,
+ GEN6_PMINTRMSK, rps_pm_mask(rps, rps->last_freq));
}
static void gen6_rps_reset_interrupts(struct intel_rps *rps)
@@ -117,7 +118,8 @@ static void rps_disable_interrupts(struct intel_rps *rps)
rps->pm_events = 0;
- set(gt->uncore, GEN6_PMINTRMSK, rps_pm_sanitize_mask(rps, ~0u));
+ intel_uncore_write(gt->uncore,
+ GEN6_PMINTRMSK, rps_pm_sanitize_mask(rps, ~0u));
spin_lock_irq(>->irq_lock);
gen6_gt_pm_disable_irq(gt, GEN6_PM_RPS_EVENTS);
--
2.26.2
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From feb8e960d780e170e992a70491eec9dd68f4dbf2 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy(a)c-s.fr>
Date: Fri, 17 Apr 2020 11:58:36 +0000
Subject: [PATCH] powerpc/mm: Fix CONFIG_PPC_KUAP_DEBUG on PPC32
CONFIG_PPC_KUAP_DEBUG is not selectable because it depends on PPC_32
which doesn't exists.
Fixing it leads to a deadlock due to a vital register getting
clobbered in _switch().
Change dependency to PPC32 and use r0 instead of r4 in _switch()
Fixes: e2fb9f544431 ("powerpc/32: Prepare for Kernel Userspace Access Protection")
Cc: stable(a)vger.kernel.org # v5.2+
Signed-off-by: Christophe Leroy <christophe.leroy(a)c-s.fr>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/540242f7d4573f7cdf1b3bf46bb35f743b2cd68f.15871246…
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index a6371fb8f761..8420abd4ea1c 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -732,7 +732,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE)
stw r10,_CCR(r1)
stw r1,KSP(r3) /* Set old stack pointer */
- kuap_check r2, r4
+ kuap_check r2, r0
#ifdef CONFIG_SMP
/* We need a sync somewhere here to make sure that if the
* previous task gets rescheduled on another CPU, it sees all
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 0c3c1902135c..27a81c291be8 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -397,7 +397,7 @@ config PPC_KUAP
config PPC_KUAP_DEBUG
bool "Extra debugging for Kernel Userspace Access Protection"
- depends on PPC_KUAP && (PPC_RADIX_MMU || PPC_32)
+ depends on PPC_KUAP && (PPC_RADIX_MMU || PPC32)
help
Add extra debugging for Kernel Userspace Access Protection (KUAP)
If you're unsure, say N.
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0fe0781f29dd8ab618999e6bda33c782ebbdb109 Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc(a)cjr.nz>
Date: Mon, 20 Apr 2020 23:44:24 -0300
Subject: [PATCH] cifs: fix uninitialised lease_key in open_shroot()
SMB2_open_init() expects a pre-initialised lease_key when opening a
file with a lease, so set pfid->lease_key prior to calling it in
open_shroot().
This issue was observed when performing some DFS failover tests and
the lease key was never randomly generated.
Signed-off-by: Paulo Alcantara (SUSE) <pc(a)cjr.nz>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
Reviewed-by: Ronnie Sahlberg <lsahlber(a)redhat.com>
Reviewed-by: Aurelien Aptel <aaptel(a)suse.com>
CC: Stable <stable(a)vger.kernel.org>
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index b36c46f48705..f829f4165d38 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -687,6 +687,11 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
+ if (!server->ops->new_lease_key)
+ return -EIO;
+
+ server->ops->new_lease_key(pfid);
+
memset(rqst, 0, sizeof(rqst));
resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b61c38baa98056d4802ff5be5cfb979efc2d0f7a Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy(a)c-s.fr>
Date: Mon, 20 Apr 2020 05:37:42 +0000
Subject: [PATCH] powerpc/8xx: Fix STRICT_KERNEL_RWX startup test failure
WRITE_RO lkdtm test works.
But when selecting CONFIG_DEBUG_RODATA_TEST, the kernel reports
rodata_test: test data was not read only
This is because when rodata test runs, there are still old entries
in TLB.
Flush TLB after setting kernel pages RO or NX.
Fixes: d5f17ee96447 ("powerpc/8xx: don't disable large TLBs with CONFIG_STRICT_KERNEL_RWX")
Cc: stable(a)vger.kernel.org # v5.1+
Signed-off-by: Christophe Leroy <christophe.leroy(a)c-s.fr>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/485caac75f195f18c11eb077b0031fdd2bb7fb9e.15873610…
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 3189308dece4..d83a12c5bc7f 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -185,6 +185,7 @@ void mmu_mark_initmem_nx(void)
mmu_mapin_ram_chunk(etext8, einittext8, PAGE_KERNEL);
}
}
+ _tlbil_all();
}
#ifdef CONFIG_STRICT_KERNEL_RWX
@@ -199,6 +200,8 @@ void mmu_mark_rodata_ro(void)
~(LARGE_PAGE_SIZE_8M - 1)));
mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext));
+ _tlbil_all();
+
/* Update page tables for PTDUMP and BDI */
mmu_mapin_ram_chunk(0, sinittext, __pgprot(0));
mmu_mapin_ram_chunk(0, etext, PAGE_KERNEL_ROX);
This patch is an improvement of a previous version[1], as the previous
version is not easy to understand.
This issue persists in the newest kernel, I have to resend the fix. As
the implementation is changed, I drop Roman's ack from the previous
version.
Here's the explanation of this issue.
memory.{low,min} won't take effect if the to-be-reclaimed memcg is the
sc->target_mem_cgroup, that can also be proved by the implementation in
mem_cgroup_protected(), see bellow,
mem_cgroup_protected
if (memcg == root) [2]
return MEMCG_PROT_NONE;
But this rule is ignored in mem_cgroup_protection(), which will read
memory.{emin, elow} as the protection whatever the memcg is.
How would this issue happen?
Because in mem_cgroup_protected() we forget to clear the
memory.{emin, elow} if the memcg is target_mem_cgroup [2].
An example to illustrate this issue.
root_mem_cgroup
/
A memory.max: 1024M
memory.min: 512M
memory.current: 800M ('current' must be greater than 'min')
Once kswapd starts to reclaim memcg A, it assigns 512M to memory.emin of A.
Then kswapd stops.
As a result of it, the memory values of A will be,
root_mem_cgroup
/
A memory.max: 1024M
memory.min: 512M
memory.current: 512M (approximately)
memory.emin: 512M
Then a new workload starts to run in memcg A, and it will trigger memcg
relcaim in A soon. As memcg A is the target_mem_cgroup of this
reclaimer, so it return directly without touching memory.{emin, elow}.[2]
The memory values of A will be,
root_mem_cgroup
/
A memory.max: 1024M
memory.min: 512M
memory.current: 1024M (approximately)
memory.emin: 512M
Then this memory.emin will be used in mem_cgroup_protection() to get the
scan count, which is obvoiusly a wrong scan count.
[1]. https://lore.kernel.org/linux-mm/20200216145249.6900-1-laoar.shao@gmail.com/
Fixes: 9783aa9917f8 ("mm, memcg: proportional memory.{low,min} reclaim")
Cc: Chris Down <chris(a)chrisdown.name>
Cc: Roman Gushchin <guro(a)fb.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Yafang Shao <laoar.shao(a)gmail.com>
---
include/linux/memcontrol.h | 13 +++++++++++--
mm/vmscan.c | 4 ++--
2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d275c72c4f8e..114cfe06bf60 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -344,12 +344,20 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg,
bool in_low_reclaim)
{
if (mem_cgroup_disabled())
return 0;
+ /*
+ * Memcg protection won't take effect if the memcg is the target
+ * root memcg.
+ */
+ if (root == memcg)
+ return 0;
+
if (in_low_reclaim)
return READ_ONCE(memcg->memory.emin);
@@ -835,7 +843,8 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
{
}
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg,
bool in_low_reclaim)
{
return 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b06868fc4926..ad2782f754ab 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2346,9 +2346,9 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long protection;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- protection = mem_cgroup_protection(memcg,
+ protection = mem_cgroup_protection(sc->target_mem_cgroup,
+ memcg,
sc->memcg_low_reclaim);
-
if (protection) {
/*
* Scale a cgroup's reclaim pressure by proportioning
--
2.18.2
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 61e713bdca3678e84815f2427f7a063fc353a1fc Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm(a)xmission.com>
Date: Mon, 20 Apr 2020 11:41:50 -0500
Subject: [PATCH] signal: Avoid corrupting si_pid and si_uid in
do_notify_parent
Christof Meerwald <cmeerw(a)cmeerw.org> writes:
> Hi,
>
> this is probably related to commit
> 7a0cf094944e2540758b7f957eb6846d5126f535 (signal: Correct namespace
> fixups of si_pid and si_uid).
>
> With a 5.6.5 kernel I am seeing SIGCHLD signals that don't include a
> properly set si_pid field - this seems to happen for multi-threaded
> child processes.
>
> A simple test program (based on the sample from the signalfd man page):
>
> #include <sys/signalfd.h>
> #include <signal.h>
> #include <unistd.h>
> #include <spawn.h>
> #include <stdlib.h>
> #include <stdio.h>
>
> #define handle_error(msg) \
> do { perror(msg); exit(EXIT_FAILURE); } while (0)
>
> int main(int argc, char *argv[])
> {
> sigset_t mask;
> int sfd;
> struct signalfd_siginfo fdsi;
> ssize_t s;
>
> sigemptyset(&mask);
> sigaddset(&mask, SIGCHLD);
>
> if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1)
> handle_error("sigprocmask");
>
> pid_t chldpid;
> char *chldargv[] = { "./sfdclient", NULL };
> posix_spawn(&chldpid, "./sfdclient", NULL, NULL, chldargv, NULL);
>
> sfd = signalfd(-1, &mask, 0);
> if (sfd == -1)
> handle_error("signalfd");
>
> for (;;) {
> s = read(sfd, &fdsi, sizeof(struct signalfd_siginfo));
> if (s != sizeof(struct signalfd_siginfo))
> handle_error("read");
>
> if (fdsi.ssi_signo == SIGCHLD) {
> printf("Got SIGCHLD %d %d %d %d\n",
> fdsi.ssi_status, fdsi.ssi_code,
> fdsi.ssi_uid, fdsi.ssi_pid);
> return 0;
> } else {
> printf("Read unexpected signal\n");
> }
> }
> }
>
>
> and a multi-threaded client to test with:
>
> #include <unistd.h>
> #include <pthread.h>
>
> void *f(void *arg)
> {
> sleep(100);
> }
>
> int main()
> {
> pthread_t t[8];
>
> for (int i = 0; i != 8; ++i)
> {
> pthread_create(&t[i], NULL, f, NULL);
> }
> }
>
> I tried to do a bit of debugging and what seems to be happening is
> that
>
> /* From an ancestor pid namespace? */
> if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
>
> fails inside task_pid_nr_ns because the check for "pid_alive" fails.
>
> This code seems to be called from do_notify_parent and there we
> actually have "tsk != current" (I am assuming both are threads of the
> current process?)
I instrumented the code with a warning and received the following backtrace:
> WARNING: CPU: 0 PID: 777 at kernel/pid.c:501 __task_pid_nr_ns.cold.6+0xc/0x15
> Modules linked in:
> CPU: 0 PID: 777 Comm: sfdclient Not tainted 5.7.0-rc1userns+ #2924
> Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> RIP: 0010:__task_pid_nr_ns.cold.6+0xc/0x15
> Code: ff 66 90 48 83 ec 08 89 7c 24 04 48 8d 7e 08 48 8d 74 24 04 e8 9a b6 44 00 48 83 c4 08 c3 48 c7 c7 59 9f ac 82 e8 c2 c4 04 00 <0f> 0b e9 3fd
> RSP: 0018:ffffc9000042fbf8 EFLAGS: 00010046
> RAX: 000000000000000c RBX: 0000000000000000 RCX: ffffc9000042faf4
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff81193d29
> RBP: ffffc9000042fc18 R08: 0000000000000000 R09: 0000000000000001
> R10: 000000100f938416 R11: 0000000000000309 R12: ffff8880b941c140
> R13: 0000000000000000 R14: 0000000000000000 R15: ffff8880b941c140
> FS: 0000000000000000(0000) GS:ffff8880bca00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007f2e8c0a32e0 CR3: 0000000002e10000 CR4: 00000000000006f0
> Call Trace:
> send_signal+0x1c8/0x310
> do_notify_parent+0x50f/0x550
> release_task.part.21+0x4fd/0x620
> do_exit+0x6f6/0xaf0
> do_group_exit+0x42/0xb0
> get_signal+0x13b/0xbb0
> do_signal+0x2b/0x670
> ? __audit_syscall_exit+0x24d/0x2b0
> ? rcu_read_lock_sched_held+0x4d/0x60
> ? kfree+0x24c/0x2b0
> do_syscall_64+0x176/0x640
> ? trace_hardirqs_off_thunk+0x1a/0x1c
> entry_SYSCALL_64_after_hwframe+0x49/0xb3
The immediate problem is as Christof noticed that "pid_alive(current) == false".
This happens because do_notify_parent is called from the last thread to exit
in a process after that thread has been reaped.
The bigger issue is that do_notify_parent can be called from any
process that manages to wait on a thread of a multi-threaded process
from wait_task_zombie. So any logic based upon current for
do_notify_parent is just nonsense, as current can be pretty much
anything.
So change do_notify_parent to call __send_signal directly.
Inspecting the code it appears this problem has existed since the pid
namespace support started handling this case in 2.6.30. This fix only
backports to 7a0cf094944e ("signal: Correct namespace fixups of si_pid and si_uid")
where the problem logic was moved out of __send_signal and into send_signal.
Cc: stable(a)vger.kernel.org
Fixes: 6588c1e3ff01 ("signals: SI_USER: Masquerade si_pid when crossing pid ns boundary")
Ref: 921cf9f63089 ("signals: protect cinit from unblocked SIG_DFL signals")
Link: https://lore.kernel.org/lkml/20200419201336.GI22017@edge.cmeerw.net/
Reported-by: Christof Meerwald <cmeerw(a)cmeerw.org>
Acked-by: Oleg Nesterov <oleg(a)redhat.com>
Acked-by: Christian Brauner <christian.brauner(a)ubuntu.com>
Signed-off-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
diff --git a/kernel/signal.c b/kernel/signal.c
index e58a6c619824..7938c60e11dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1993,8 +1993,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = 0;
}
+ /*
+ * Send with __send_signal as si_pid and si_uid are in the
+ * parent's namespaces.
+ */
if (valid_signal(sig) && sig)
- __group_send_sig_info(sig, &info, tsk->parent);
+ __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 61e713bdca3678e84815f2427f7a063fc353a1fc Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm(a)xmission.com>
Date: Mon, 20 Apr 2020 11:41:50 -0500
Subject: [PATCH] signal: Avoid corrupting si_pid and si_uid in
do_notify_parent
Christof Meerwald <cmeerw(a)cmeerw.org> writes:
> Hi,
>
> this is probably related to commit
> 7a0cf094944e2540758b7f957eb6846d5126f535 (signal: Correct namespace
> fixups of si_pid and si_uid).
>
> With a 5.6.5 kernel I am seeing SIGCHLD signals that don't include a
> properly set si_pid field - this seems to happen for multi-threaded
> child processes.
>
> A simple test program (based on the sample from the signalfd man page):
>
> #include <sys/signalfd.h>
> #include <signal.h>
> #include <unistd.h>
> #include <spawn.h>
> #include <stdlib.h>
> #include <stdio.h>
>
> #define handle_error(msg) \
> do { perror(msg); exit(EXIT_FAILURE); } while (0)
>
> int main(int argc, char *argv[])
> {
> sigset_t mask;
> int sfd;
> struct signalfd_siginfo fdsi;
> ssize_t s;
>
> sigemptyset(&mask);
> sigaddset(&mask, SIGCHLD);
>
> if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1)
> handle_error("sigprocmask");
>
> pid_t chldpid;
> char *chldargv[] = { "./sfdclient", NULL };
> posix_spawn(&chldpid, "./sfdclient", NULL, NULL, chldargv, NULL);
>
> sfd = signalfd(-1, &mask, 0);
> if (sfd == -1)
> handle_error("signalfd");
>
> for (;;) {
> s = read(sfd, &fdsi, sizeof(struct signalfd_siginfo));
> if (s != sizeof(struct signalfd_siginfo))
> handle_error("read");
>
> if (fdsi.ssi_signo == SIGCHLD) {
> printf("Got SIGCHLD %d %d %d %d\n",
> fdsi.ssi_status, fdsi.ssi_code,
> fdsi.ssi_uid, fdsi.ssi_pid);
> return 0;
> } else {
> printf("Read unexpected signal\n");
> }
> }
> }
>
>
> and a multi-threaded client to test with:
>
> #include <unistd.h>
> #include <pthread.h>
>
> void *f(void *arg)
> {
> sleep(100);
> }
>
> int main()
> {
> pthread_t t[8];
>
> for (int i = 0; i != 8; ++i)
> {
> pthread_create(&t[i], NULL, f, NULL);
> }
> }
>
> I tried to do a bit of debugging and what seems to be happening is
> that
>
> /* From an ancestor pid namespace? */
> if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
>
> fails inside task_pid_nr_ns because the check for "pid_alive" fails.
>
> This code seems to be called from do_notify_parent and there we
> actually have "tsk != current" (I am assuming both are threads of the
> current process?)
I instrumented the code with a warning and received the following backtrace:
> WARNING: CPU: 0 PID: 777 at kernel/pid.c:501 __task_pid_nr_ns.cold.6+0xc/0x15
> Modules linked in:
> CPU: 0 PID: 777 Comm: sfdclient Not tainted 5.7.0-rc1userns+ #2924
> Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> RIP: 0010:__task_pid_nr_ns.cold.6+0xc/0x15
> Code: ff 66 90 48 83 ec 08 89 7c 24 04 48 8d 7e 08 48 8d 74 24 04 e8 9a b6 44 00 48 83 c4 08 c3 48 c7 c7 59 9f ac 82 e8 c2 c4 04 00 <0f> 0b e9 3fd
> RSP: 0018:ffffc9000042fbf8 EFLAGS: 00010046
> RAX: 000000000000000c RBX: 0000000000000000 RCX: ffffc9000042faf4
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff81193d29
> RBP: ffffc9000042fc18 R08: 0000000000000000 R09: 0000000000000001
> R10: 000000100f938416 R11: 0000000000000309 R12: ffff8880b941c140
> R13: 0000000000000000 R14: 0000000000000000 R15: ffff8880b941c140
> FS: 0000000000000000(0000) GS:ffff8880bca00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007f2e8c0a32e0 CR3: 0000000002e10000 CR4: 00000000000006f0
> Call Trace:
> send_signal+0x1c8/0x310
> do_notify_parent+0x50f/0x550
> release_task.part.21+0x4fd/0x620
> do_exit+0x6f6/0xaf0
> do_group_exit+0x42/0xb0
> get_signal+0x13b/0xbb0
> do_signal+0x2b/0x670
> ? __audit_syscall_exit+0x24d/0x2b0
> ? rcu_read_lock_sched_held+0x4d/0x60
> ? kfree+0x24c/0x2b0
> do_syscall_64+0x176/0x640
> ? trace_hardirqs_off_thunk+0x1a/0x1c
> entry_SYSCALL_64_after_hwframe+0x49/0xb3
The immediate problem is as Christof noticed that "pid_alive(current) == false".
This happens because do_notify_parent is called from the last thread to exit
in a process after that thread has been reaped.
The bigger issue is that do_notify_parent can be called from any
process that manages to wait on a thread of a multi-threaded process
from wait_task_zombie. So any logic based upon current for
do_notify_parent is just nonsense, as current can be pretty much
anything.
So change do_notify_parent to call __send_signal directly.
Inspecting the code it appears this problem has existed since the pid
namespace support started handling this case in 2.6.30. This fix only
backports to 7a0cf094944e ("signal: Correct namespace fixups of si_pid and si_uid")
where the problem logic was moved out of __send_signal and into send_signal.
Cc: stable(a)vger.kernel.org
Fixes: 6588c1e3ff01 ("signals: SI_USER: Masquerade si_pid when crossing pid ns boundary")
Ref: 921cf9f63089 ("signals: protect cinit from unblocked SIG_DFL signals")
Link: https://lore.kernel.org/lkml/20200419201336.GI22017@edge.cmeerw.net/
Reported-by: Christof Meerwald <cmeerw(a)cmeerw.org>
Acked-by: Oleg Nesterov <oleg(a)redhat.com>
Acked-by: Christian Brauner <christian.brauner(a)ubuntu.com>
Signed-off-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
diff --git a/kernel/signal.c b/kernel/signal.c
index e58a6c619824..7938c60e11dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1993,8 +1993,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = 0;
}
+ /*
+ * Send with __send_signal as si_pid and si_uid are in the
+ * parent's namespaces.
+ */
if (valid_signal(sig) && sig)
- __group_send_sig_info(sig, &info, tsk->parent);
+ __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 61e713bdca3678e84815f2427f7a063fc353a1fc Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm(a)xmission.com>
Date: Mon, 20 Apr 2020 11:41:50 -0500
Subject: [PATCH] signal: Avoid corrupting si_pid and si_uid in
do_notify_parent
Christof Meerwald <cmeerw(a)cmeerw.org> writes:
> Hi,
>
> this is probably related to commit
> 7a0cf094944e2540758b7f957eb6846d5126f535 (signal: Correct namespace
> fixups of si_pid and si_uid).
>
> With a 5.6.5 kernel I am seeing SIGCHLD signals that don't include a
> properly set si_pid field - this seems to happen for multi-threaded
> child processes.
>
> A simple test program (based on the sample from the signalfd man page):
>
> #include <sys/signalfd.h>
> #include <signal.h>
> #include <unistd.h>
> #include <spawn.h>
> #include <stdlib.h>
> #include <stdio.h>
>
> #define handle_error(msg) \
> do { perror(msg); exit(EXIT_FAILURE); } while (0)
>
> int main(int argc, char *argv[])
> {
> sigset_t mask;
> int sfd;
> struct signalfd_siginfo fdsi;
> ssize_t s;
>
> sigemptyset(&mask);
> sigaddset(&mask, SIGCHLD);
>
> if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1)
> handle_error("sigprocmask");
>
> pid_t chldpid;
> char *chldargv[] = { "./sfdclient", NULL };
> posix_spawn(&chldpid, "./sfdclient", NULL, NULL, chldargv, NULL);
>
> sfd = signalfd(-1, &mask, 0);
> if (sfd == -1)
> handle_error("signalfd");
>
> for (;;) {
> s = read(sfd, &fdsi, sizeof(struct signalfd_siginfo));
> if (s != sizeof(struct signalfd_siginfo))
> handle_error("read");
>
> if (fdsi.ssi_signo == SIGCHLD) {
> printf("Got SIGCHLD %d %d %d %d\n",
> fdsi.ssi_status, fdsi.ssi_code,
> fdsi.ssi_uid, fdsi.ssi_pid);
> return 0;
> } else {
> printf("Read unexpected signal\n");
> }
> }
> }
>
>
> and a multi-threaded client to test with:
>
> #include <unistd.h>
> #include <pthread.h>
>
> void *f(void *arg)
> {
> sleep(100);
> }
>
> int main()
> {
> pthread_t t[8];
>
> for (int i = 0; i != 8; ++i)
> {
> pthread_create(&t[i], NULL, f, NULL);
> }
> }
>
> I tried to do a bit of debugging and what seems to be happening is
> that
>
> /* From an ancestor pid namespace? */
> if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
>
> fails inside task_pid_nr_ns because the check for "pid_alive" fails.
>
> This code seems to be called from do_notify_parent and there we
> actually have "tsk != current" (I am assuming both are threads of the
> current process?)
I instrumented the code with a warning and received the following backtrace:
> WARNING: CPU: 0 PID: 777 at kernel/pid.c:501 __task_pid_nr_ns.cold.6+0xc/0x15
> Modules linked in:
> CPU: 0 PID: 777 Comm: sfdclient Not tainted 5.7.0-rc1userns+ #2924
> Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> RIP: 0010:__task_pid_nr_ns.cold.6+0xc/0x15
> Code: ff 66 90 48 83 ec 08 89 7c 24 04 48 8d 7e 08 48 8d 74 24 04 e8 9a b6 44 00 48 83 c4 08 c3 48 c7 c7 59 9f ac 82 e8 c2 c4 04 00 <0f> 0b e9 3fd
> RSP: 0018:ffffc9000042fbf8 EFLAGS: 00010046
> RAX: 000000000000000c RBX: 0000000000000000 RCX: ffffc9000042faf4
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff81193d29
> RBP: ffffc9000042fc18 R08: 0000000000000000 R09: 0000000000000001
> R10: 000000100f938416 R11: 0000000000000309 R12: ffff8880b941c140
> R13: 0000000000000000 R14: 0000000000000000 R15: ffff8880b941c140
> FS: 0000000000000000(0000) GS:ffff8880bca00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007f2e8c0a32e0 CR3: 0000000002e10000 CR4: 00000000000006f0
> Call Trace:
> send_signal+0x1c8/0x310
> do_notify_parent+0x50f/0x550
> release_task.part.21+0x4fd/0x620
> do_exit+0x6f6/0xaf0
> do_group_exit+0x42/0xb0
> get_signal+0x13b/0xbb0
> do_signal+0x2b/0x670
> ? __audit_syscall_exit+0x24d/0x2b0
> ? rcu_read_lock_sched_held+0x4d/0x60
> ? kfree+0x24c/0x2b0
> do_syscall_64+0x176/0x640
> ? trace_hardirqs_off_thunk+0x1a/0x1c
> entry_SYSCALL_64_after_hwframe+0x49/0xb3
The immediate problem is as Christof noticed that "pid_alive(current) == false".
This happens because do_notify_parent is called from the last thread to exit
in a process after that thread has been reaped.
The bigger issue is that do_notify_parent can be called from any
process that manages to wait on a thread of a multi-threaded process
from wait_task_zombie. So any logic based upon current for
do_notify_parent is just nonsense, as current can be pretty much
anything.
So change do_notify_parent to call __send_signal directly.
Inspecting the code it appears this problem has existed since the pid
namespace support started handling this case in 2.6.30. This fix only
backports to 7a0cf094944e ("signal: Correct namespace fixups of si_pid and si_uid")
where the problem logic was moved out of __send_signal and into send_signal.
Cc: stable(a)vger.kernel.org
Fixes: 6588c1e3ff01 ("signals: SI_USER: Masquerade si_pid when crossing pid ns boundary")
Ref: 921cf9f63089 ("signals: protect cinit from unblocked SIG_DFL signals")
Link: https://lore.kernel.org/lkml/20200419201336.GI22017@edge.cmeerw.net/
Reported-by: Christof Meerwald <cmeerw(a)cmeerw.org>
Acked-by: Oleg Nesterov <oleg(a)redhat.com>
Acked-by: Christian Brauner <christian.brauner(a)ubuntu.com>
Signed-off-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
diff --git a/kernel/signal.c b/kernel/signal.c
index e58a6c619824..7938c60e11dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1993,8 +1993,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = 0;
}
+ /*
+ * Send with __send_signal as si_pid and si_uid are in the
+ * parent's namespaces.
+ */
if (valid_signal(sig) && sig)
- __group_send_sig_info(sig, &info, tsk->parent);
+ __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);