After a treclaim, we expect to be in non-transactional state. If we don't
immediately clear the current thread's MSR[TS] and we get preempted, then
tm_recheckpoint_new_task() will recheckpoint and we get rescheduled in
suspended transaction state.
When handling a signal caught in transactional state, handle_rt_signal64()
calls get_tm_stackpointer(), which treclaims the transaction using
tm_reclaim_current() but without clearing the thread's MSR[TS]. This can cause
the TM Bad Thing exception below if we later page fault and get preempted while
trying to access the user's sigframe with __put_user(). Afterwards, when we are
rescheduled back into do_page_fault() (but now in suspended state, since the
thread's MSR[TS] was not cleared), upon executing 'rfid' after completion of
the page fault handling, the exception is raised because a transition from
suspended to non-transactional state is invalid.
Unexpected TM Bad Thing exception at c00000000000de44 (msr 0x8000000302a03031) tm_scratch=800000010280b033
Oops: Unrecoverable exception, sig: 6 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in: nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip6_tables ip_tables nft_compat ip_set nf_tables nfnetlink xts vmx_crypto sg virtio_balloon
r_mod cdrom virtio_net net_failover virtio_blk virtio_scsi failover dm_mirror dm_region_hash dm_log dm_mod
CPU: 25 PID: 15547 Comm: a.out Not tainted 5.4.0-rc2 #32
NIP: c00000000000de44 LR: c000000000034728 CTR: 0000000000000000
REGS: c00000003fe7bd70 TRAP: 0700 Not tainted (5.4.0-rc2)
MSR: 8000000302a03031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[SE]> CR: 44000884 XER: 00000000
CFAR: c00000000000dda4 IRQMASK: 0
PACATMSCRATCH: 800000010280b033
GPR00: c000000000034728 c000000f65a17c80 c000000001662800 00007fffacf3fd78
GPR04: 0000000000001000 0000000000001000 0000000000000000 c000000f611f8af0
GPR08: 0000000000000000 0000000078006001 0000000000000000 000c000000000000
GPR12: c000000f611f84b0 c00000003ffcb200 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 c000000f611f8140
GPR24: 0000000000000000 00007fffacf3fd68 c000000f65a17d90 c000000f611f7800
GPR28: c000000f65a17e90 c000000f65a17e90 c000000001685e18 00007fffacf3f000
NIP [c00000000000de44] fast_exception_return+0xf4/0x1b0
LR [c000000000034728] handle_rt_signal64+0x78/0xc50
Call Trace:
[c000000f65a17c80] [c000000000034710] handle_rt_signal64+0x60/0xc50 (unreliable)
[c000000f65a17d30] [c000000000023640] do_notify_resume+0x330/0x460
[c000000f65a17e20] [c00000000000dcc4] ret_from_except_lite+0x70/0x74
Instruction dump:
7c4ff120 e8410170 7c5a03a6 38400000 f8410060 e8010070 e8410080 e8610088
60000000 60000000 e8810090 e8210078 <4c000024> 48000000 e8610178 88ed0989
---[ end trace 93094aa44b442f87 ]---
The simplified sequence of events that triggers the above exception is:
  ... # userspace in NON-TRANSACTIONAL state
  tbegin # userspace in TRANSACTIONAL state
  signal delivery # kernelspace in SUSPENDED state

  handle_rt_signal64()
    get_tm_stackpointer()
      treclaim # kernelspace in NON-TRANSACTIONAL state
    __put_user()
      page fault happens. We will never get back here because of the TM Bad Thing exception.

  page fault handling kicks in and we voluntarily preempt ourselves
  do_page_fault()
    __schedule()
      __switch_to(other_task)

  our task is rescheduled and we recheckpoint because the thread's MSR[TS] was not cleared
  __switch_to(our_task)
    switch_to_tm()
      tm_recheckpoint_new_task()
        trechkpt # kernelspace in SUSPENDED state

  The page fault handling resumes, but now we are in suspended transaction state
  do_page_fault() completes
  rfid <----- trying to get back where the page fault happened (we were non-transactional back then)
    TM Bad Thing # illegal transition from suspended to non-transactional
This patch fixes that issue by clearing the current thread's MSR[TS] just after
treclaim in get_tm_stackpointer() so that we stay in non-transactional state in
case we are preempted. In order to make treclaim and the clearing of the thread's
MSR[TS] atomic from a preemption perspective when CONFIG_PREEMPT is set,
preempt_disable/enable() is used. It's also necessary to save the previous
value of the thread's MSR before get_tm_stackpointer() is called so that it can
later be exposed to the signal handler in setup_tm_sigcontexts(), informing
userspace of the MSR value at the moment of signal delivery.
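For reference, here is a sketch of how get_tm_stackpointer() reads once the hunk
below is applied. It is reassembled from the diff (the leading block comment is
elided), not copied from the resulting tree:

unsigned long get_tm_stackpointer(struct task_struct *tsk)
{
	/* ... comment about returning the checkpointed stack pointer
	 * for active transactions, elided ... */
	unsigned long ret = tsk->thread.regs->gpr[1];

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	BUG_ON(tsk != current);

	if (MSR_TM_ACTIVE(tsk->thread.regs->msr)) {
		preempt_disable();
		tm_reclaim_current(TM_CAUSE_SIGNAL);
		if (MSR_TM_TRANSACTIONAL(tsk->thread.regs->msr))
			ret = tsk->thread.ckpt_regs.gpr[1];

		/* Clear MSR[TS] right after treclaim so that a preemption
		 * between here and the caller cannot recheckpoint us via
		 * tm_recheckpoint_new_task(); we must enter the signal
		 * handler in non-transactional state.
		 */
		tsk->thread.regs->msr &= ~MSR_TS_MASK;
		preempt_enable();
	}
#endif

	return ret;
}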
Found with tm-signal-context-force-tm kernel selftest on P8 KVM.
Fixes: 2b0a576d15e0 ("powerpc: Add new transactional memory state to the signal context")
Cc: stable(a)vger.kernel.org # v3.9
Signed-off-by: Gustavo Luiz Duarte <gustavold(a)linux.ibm.com>
---
arch/powerpc/kernel/signal.c | 17 +++++++++++++++--
arch/powerpc/kernel/signal_32.c | 24 ++++++++++--------------
arch/powerpc/kernel/signal_64.c | 20 ++++++++------------
3 files changed, 33 insertions(+), 28 deletions(-)
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index e6c30cee6abf..1660be1061ac 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -200,14 +200,27 @@ unsigned long get_tm_stackpointer(struct task_struct *tsk)
* normal/non-checkpointed stack pointer.
*/
+ unsigned long ret = tsk->thread.regs->gpr[1];
+
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BUG_ON(tsk != current);
if (MSR_TM_ACTIVE(tsk->thread.regs->msr)) {
+ preempt_disable();
tm_reclaim_current(TM_CAUSE_SIGNAL);
if (MSR_TM_TRANSACTIONAL(tsk->thread.regs->msr))
- return tsk->thread.ckpt_regs.gpr[1];
+ ret = tsk->thread.ckpt_regs.gpr[1];
+
+ /* If we treclaim, we must immediately clear the current
+ * thread's TM bits. Otherwise we might be preempted and have
+ * the live MSR[TS] changed behind our back
+ * (tm_recheckpoint_new_task() would recheckpoint).
+ * Besides, we enter the signal handler in non-transactional
+ * state.
+ */
+ tsk->thread.regs->msr &= ~MSR_TS_MASK;
+ preempt_enable();
}
#endif
- return tsk->thread.regs->gpr[1];
+ return ret;
}
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 98600b276f76..132a092cd170 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -489,19 +489,11 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
*/
static int save_tm_user_regs(struct pt_regs *regs,
struct mcontext __user *frame,
- struct mcontext __user *tm_frame, int sigret)
+ struct mcontext __user *tm_frame, int sigret,
+ unsigned long msr)
{
- unsigned long msr = regs->msr;
-
WARN_ON(tm_suspend_disabled);
- /* Remove TM bits from thread's MSR. The MSR in the sigcontext
- * just indicates to userland that we were doing a transaction, but we
- * don't want to return in transactional state. This also ensures
- * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
- */
- regs->msr &= ~MSR_TS_MASK;
-
/* Save both sets of general registers */
if (save_general_regs(&current->thread.ckpt_regs, frame)
|| save_general_regs(regs, tm_frame))
@@ -912,6 +904,8 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
int sigret;
unsigned long tramp;
struct pt_regs *regs = tsk->thread.regs;
+ /* Save the thread's msr before get_tm_stackpointer() changes it */
+ unsigned long msr = regs->msr;
BUG_ON(tsk != current);
@@ -944,13 +938,13 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
tm_frame = &rt_sf->uc_transact.uc_mcontext;
- if (MSR_TM_ACTIVE(regs->msr)) {
+ if (MSR_TM_ACTIVE(msr)) {
if (__put_user((unsigned long)&rt_sf->uc_transact,
&rt_sf->uc.uc_link) ||
__put_user((unsigned long)tm_frame,
&rt_sf->uc_transact.uc_regs))
goto badframe;
- if (save_tm_user_regs(regs, frame, tm_frame, sigret))
+ if (save_tm_user_regs(regs, frame, tm_frame, sigret, msr))
goto badframe;
}
else
@@ -1369,6 +1363,8 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
int sigret;
unsigned long tramp;
struct pt_regs *regs = tsk->thread.regs;
+ /* Save the thread's msr before get_tm_stackpointer() changes it */
+ unsigned long msr = regs->msr;
BUG_ON(tsk != current);
@@ -1402,9 +1398,9 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
tm_mctx = &frame->mctx_transact;
- if (MSR_TM_ACTIVE(regs->msr)) {
+ if (MSR_TM_ACTIVE(msr)) {
if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact,
- sigret))
+ sigret, msr))
goto badframe;
}
else
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 117515564ec7..e5b5f9738056 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -192,7 +192,8 @@ static long setup_sigcontext(struct sigcontext __user *sc,
static long setup_tm_sigcontexts(struct sigcontext __user *sc,
struct sigcontext __user *tm_sc,
struct task_struct *tsk,
- int signr, sigset_t *set, unsigned long handler)
+ int signr, sigset_t *set, unsigned long handler,
+ unsigned long msr)
{
/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
* process never used altivec yet (MSR_VEC is zero in pt_regs of
@@ -207,12 +208,11 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
elf_vrreg_t __user *tm_v_regs = sigcontext_vmx_regs(tm_sc);
#endif
struct pt_regs *regs = tsk->thread.regs;
- unsigned long msr = tsk->thread.regs->msr;
long err = 0;
BUG_ON(tsk != current);
- BUG_ON(!MSR_TM_ACTIVE(regs->msr));
+ BUG_ON(!MSR_TM_ACTIVE(msr));
WARN_ON(tm_suspend_disabled);
@@ -222,13 +222,6 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
*/
msr |= tsk->thread.ckpt_regs.msr & (MSR_FP | MSR_VEC | MSR_VSX);
- /* Remove TM bits from thread's MSR. The MSR in the sigcontext
- * just indicates to userland that we were doing a transaction, but we
- * don't want to return in transactional state. This also ensures
- * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
- */
- regs->msr &= ~MSR_TS_MASK;
-
#ifdef CONFIG_ALTIVEC
err |= __put_user(v_regs, &sc->v_regs);
err |= __put_user(tm_v_regs, &tm_sc->v_regs);
@@ -824,6 +817,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
unsigned long newsp = 0;
long err = 0;
struct pt_regs *regs = tsk->thread.regs;
+ /* Save the thread's msr before get_tm_stackpointer() changes it */
+ unsigned long msr = regs->msr;
BUG_ON(tsk != current);
@@ -841,7 +836,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
err |= __put_user(0, &frame->uc.uc_flags);
err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]);
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
- if (MSR_TM_ACTIVE(regs->msr)) {
+ if (MSR_TM_ACTIVE(msr)) {
/* The ucontext_t passed to userland points to the second
* ucontext_t (for transactional state) with its uc_link ptr.
*/
@@ -849,7 +844,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext,
&frame->uc_transact.uc_mcontext,
tsk, ksig->sig, NULL,
- (unsigned long)ksig->ka.sa.sa_handler);
+ (unsigned long)ksig->ka.sa.sa_handler,
+ msr);
} else
#endif
{
--
2.21.0
As discussed in the strace issue tracker, it appears that the sparc32
sysvipc support has been broken for the past 11 years. It was however
working in compat mode, which is how it must have escaped most of the
regular testing.
The problem is that a cleanup patch inadvertently changed the uid/gid
fields in struct ipc64_perm from 32-bit types to 16-bit types in uapi
headers.
Both glibc and uclibc-ng still use the original types, so they should
work fine with compat mode, but not natively. Change the definitions
to use __kernel_uid32_t and __kernel_gid32_t again.
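To illustrate why the narrower types break the native ABI, here is a small
userspace sketch. The structs are simplified stand-ins, not the real uapi
definitions, and the exact field types are assumptions made for the example:

/* Illustration only: shrinking uid/gid/cuid/cgid from 32-bit to 16-bit
 * shifts the offset of every following member, so native sparc32 binaries
 * built against libc headers that kept the 32-bit fields read the wrong
 * offsets.
 */
#include <stdio.h>
#include <stddef.h>

struct perm_as_libc_expects {		/* 32-bit uid/gid fields */
	int key;
	unsigned int uid, gid, cuid, cgid;
	unsigned short mode, seq;
};

struct perm_after_cleanup {		/* 16-bit uid/gid fields */
	int key;
	unsigned short uid, gid, cuid, cgid;
	unsigned short mode, seq;
};

int main(void)
{
	printf("offset of mode: libc view %zu, broken header %zu\n",
	       offsetof(struct perm_as_libc_expects, mode),
	       offsetof(struct perm_after_cleanup, mode));
	return 0;
}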
Fixes: 83c86984bff2 ("sparc: unify ipcbuf.h")
Link: https://github.com/strace/strace/issues/116
Cc: <stable(a)vger.kernel.org> # v2.6.29
Cc: Sam Ravnborg <sam(a)ravnborg.org>
Cc: "Dmitry V . Levin" <ldv(a)altlinux.org>
Cc: Rich Felker <dalias(a)libc.org>
Cc: libc-alpha(a)sourceware.org
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
---
arch/sparc/include/uapi/asm/ipcbuf.h | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/arch/sparc/include/uapi/asm/ipcbuf.h b/arch/sparc/include/uapi/asm/ipcbuf.h
index 5b933a598a33..0ea1240d2ea1 100644
--- a/arch/sparc/include/uapi/asm/ipcbuf.h
+++ b/arch/sparc/include/uapi/asm/ipcbuf.h
@@ -17,19 +17,19 @@
struct ipc64_perm
{
- __kernel_key_t key;
- __kernel_uid_t uid;
- __kernel_gid_t gid;
- __kernel_uid_t cuid;
- __kernel_gid_t cgid;
+ __kernel_key_t key;
+ __kernel_uid32_t uid;
+ __kernel_gid32_t gid;
+ __kernel_uid32_t cuid;
+ __kernel_gid32_t cgid;
#ifndef __arch64__
- unsigned short __pad0;
+ unsigned short __pad0;
#endif
- __kernel_mode_t mode;
- unsigned short __pad1;
- unsigned short seq;
- unsigned long long __unused1;
- unsigned long long __unused2;
+ __kernel_mode_t mode;
+ unsigned short __pad1;
+ unsigned short seq;
+ unsigned long long __unused1;
+ unsigned long long __unused2;
};
#endif /* __SPARC_IPCBUF_H */
--
2.20.0
Hi Sasha,
On Sun, Jan 19, 2020 at 5:26 PM Sasha Levin <sashal(a)kernel.org> wrote:
>
> Hi,
>
> [This is an automated email]
>
> This commit has been processed because it contains a -stable tag.
> The stable tag indicates that it's relevant for the following trees: all
>
> The bot has tested the following trees: v5.4.13, v4.19.97, v4.14.166, v4.9.210, v4.4.210.
>
> v5.4.13: Build OK!
> v4.19.97: Failed to apply! Possible dependencies:
I'm looking into making a patch for v4.19.y. The rest are not relevant.
Thanks,
Gilad
Hi,
On 2020-01-19 15:26:54 +0000, Sasha Levin wrote:
> This commit has been processed because it contains a "Fixes:" tag,
> fixing commit: 722ddfde366f ("perf tools: Fix time sorting").
> The bot has tested the following trees: v5.4.13, v4.19.97, v4.14.166, v4.9.210, v4.4.210.
>
> How should we proceed with this patch?
>
> v5.4.13: Build OK!
> v4.19.97: Build OK!
> v4.14.166: Build OK!
> v4.9.210: Failed to apply! Possible dependencies:
> 7aef3bf3daa1 ("perf c2c: Add c2c command")
> v4.4.210: Failed to apply! Possible dependencies:
> 7aef3bf3daa1 ("perf c2c: Add c2c command")
Looks like it doesn't need to be backported to the failing
branches. "Fixes" was approximate here anyway, since that commit just
exposed a pre-existing coding bug that was masked by the fix in
722ddfde366f. And since 4.9/4.4 don't have c2c at all, there's nothing
to fix.
Greetings,
Andres Freund
From: Tianyu Lan <Tianyu.Lan(a)microsoft.com>
The current code assumes that the balloon request memory size is always
2MB-aligned, but Hyper-V doesn't actually guarantee such alignment. When
the balloon driver receives a non-aligned balloon request, it produces a
warning and balloons up more memory than requested in order to keep the 2MB
alignment.
Remove the warning and balloon up memory according to the actual requested
memory size.
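A minimal, standalone model of the post-patch allocation strategy follows. It is
an illustration only, not the driver code; alloc_balloon_pages()'s real
bookkeeping and the response message handling are elided:

/* Try 2MB (512-page) units first; whatever a 2MB pass cannot cover --
 * including a non-aligned remainder -- is retried with 4K units.
 */
#include <stdio.h>

#define PAGES_IN_2M 512

/* Stand-in for alloc_balloon_pages(): pretend every allocation succeeds
 * and return how many pages were ballooned in this pass. */
static unsigned int model_alloc(unsigned int num_pages, unsigned int alloc_unit)
{
	unsigned int i;

	for (i = 0; i < num_pages / alloc_unit; i++)
		;	/* allocate alloc_unit pages here */

	return i * alloc_unit;
}

int main(void)
{
	unsigned int num_pages = 1000;		/* non-2MB-aligned request */
	unsigned int alloc_unit = PAGES_IN_2M;
	unsigned int done = 0;

	while (done < num_pages) {
		unsigned int got = model_alloc(num_pages - done, alloc_unit);

		/* Fall back to 4K units once 2MB units can no longer
		 * cover the remaining (possibly unaligned) request. */
		if (alloc_unit != 1 && got != num_pages - done)
			alloc_unit = 1;

		done += got;
		printf("ballooned %u of %u requested pages\n", done, num_pages);
	}
	return 0;
}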
Fixes: f6712238471a ("hv: hv_balloon: avoid memory leak on alloc_error of 2MB memory block")
Cc: stable(a)vger.kernel.org
Reviewed-by: Vitaly Kuznetsov <vkuznets(a)redhat.com>
Signed-off-by: Tianyu Lan <Tianyu.Lan(a)microsoft.com>
---
Change since v2:
       - Remove the check between the requested page number and alloc_unit
         in alloc_balloon_pages() because it's redundant with the
         new change.
       - Remove the "continue" just following the alloc_unit switch
         from 2MB to 4K in order to avoid skipping already-allocated
         memory.
Change since v1:
       - Change the logic of switching alloc_unit from 2MB to 4KB
         in balloon_up() to avoid a redundant iteration when
         handling non-aligned page requests.
       - Remove the 2MB alignment operation and comment in balloon_up()
---
drivers/hv/hv_balloon.c | 17 ++++-------------
1 file changed, 4 insertions(+), 13 deletions(-)
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 7f3e7ab22d5d..73092a7a3345 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -1681,10 +1681,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
unsigned int i, j;
struct page *pg;
- if (num_pages < alloc_unit)
- return 0;
-
- for (i = 0; (i * alloc_unit) < num_pages; i++) {
+ for (i = 0; i < num_pages / alloc_unit; i++) {
if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
HV_HYP_PAGE_SIZE)
return i * alloc_unit;
@@ -1722,7 +1719,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
}
- return num_pages;
+ return i * alloc_unit;
}
static void balloon_up(union dm_msg_info *msg_info)
@@ -1737,9 +1734,6 @@ static void balloon_up(union dm_msg_info *msg_info)
long avail_pages;
unsigned long floor;
- /* The host balloons pages in 2M granularity. */
- WARN_ON_ONCE(num_pages % PAGES_IN_2M != 0);
-
/*
* We will attempt 2M allocations. However, if we fail to
* allocate 2M chunks, we will go back to PAGE_SIZE allocations.
@@ -1749,14 +1743,13 @@ static void balloon_up(union dm_msg_info *msg_info)
avail_pages = si_mem_available();
floor = compute_balloon_floor();
- /* Refuse to balloon below the floor, keep the 2M granularity. */
+ /* Refuse to balloon below the floor. */
if (avail_pages < num_pages || avail_pages - num_pages < floor) {
pr_warn("Balloon request will be partially fulfilled. %s\n",
avail_pages < num_pages ? "Not enough memory." :
"Balloon floor reached.");
num_pages = avail_pages > floor ? (avail_pages - floor) : 0;
- num_pages -= num_pages % PAGES_IN_2M;
}
while (!done) {
@@ -1770,10 +1763,8 @@ static void balloon_up(union dm_msg_info *msg_info)
num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
bl_resp, alloc_unit);
- if (alloc_unit != 1 && num_ballooned == 0) {
+ if (alloc_unit != 1 && num_ballooned != num_pages)
alloc_unit = 1;
- continue;
- }
if (num_ballooned == 0 || num_ballooned == num_pages) {
pr_debug("Ballooned %u out of %u requested pages.\n",
--
2.14.5