From: Joerg Roedel <jroedel(a)suse.de>
Allow a runtime opt-out of kexec support for architecture code in case
the kernel is running in an environment where kexec is not properly
supported yet.
This will be used on x86 when the kernel is running as an SEV-ES
guest. SEV-ES guests need special handling for kexec to hand over all
CPUs to the new kernel. This requires special hypervisor support and
handling code in the guest which is not yet implemented.
Cc: stable(a)vger.kernel.org # v5.10+
Signed-off-by: Joerg Roedel <jroedel(a)suse.de>
---
include/linux/kexec.h | 1 +
kernel/kexec.c | 14 ++++++++++++++
kernel/kexec_file.c | 9 +++++++++
3 files changed, 24 insertions(+)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 0c994ae37729..85c30dcd0bdc 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -201,6 +201,7 @@ int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
unsigned long buf_len);
#endif
int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf);
+bool arch_kexec_supported(void);
extern int kexec_add_buffer(struct kexec_buf *kbuf);
int kexec_locate_mem_hole(struct kexec_buf *kbuf);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b5e40f069768..275cda429380 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -190,11 +190,25 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
* that to happen you need to do that yourself.
*/
+bool __weak arch_kexec_supported(void)
+{
+ return true;
+}
+
static inline int kexec_load_check(unsigned long nr_segments,
unsigned long flags)
{
int result;
+ /*
+ * The architecture may support kexec in general, but the kernel could
+ * run in an environment where it is not (yet) possible to execute a new
+ * kernel. Allow the architecture code to opt-out of kexec support when
+ * it is running in such an environment.
+ */
+ if (!arch_kexec_supported())
+ return -ENOSYS;
+
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 33400ff051a8..96d08a512e9c 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -358,6 +358,15 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
int ret = 0, i;
struct kimage **dest_image, *image;
+ /*
+ * The architecture may support kexec in general, but the kernel could
+ * run in an environment where it is not (yet) possible to execute a new
+ * kernel. Allow the architecture code to opt-out of kexec support when
+ * it is running in such an environment.
+ */
+ if (!arch_kexec_supported())
+ return -ENOSYS;
+
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
--
2.33.0
The partition itself, not its child partition, should be removed from the
partition list. Otherwise the partition list gets corrupted and any
subsequent remove operation leads to a kernel panic.
Fixes: 46b5889cc2c5 ("mtd: implement proper partition handling")
Signed-off-by: Andreas Oetken <andreas.oetken(a)siemens-energy.com>
Cc: stable(a)vger.kernel.org
---
drivers/mtd/mtdpart.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 95d47422bbf20..5725818fa199f 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -313,7 +313,7 @@ static int __mtd_del_partition(struct mtd_info *mtd)
if (err)
return err;
- list_del(&child->part.node);
+ list_del(&mtd->part.node);
free_partition(mtd);
return 0;
--
2.30.2
copy_process currently copies task_struct.posix_cputimers_work as-is. If a
timer interrupt arrives while handling clone and before dup_task_struct
completes then the child task will have:
1. posix_cputimers_work.scheduled = true
2. posix_cputimers_work.work queued.
copy_process clears task_struct.task_works, so (2) will have no effect and
posix_cpu_timers_work will never run (not to mention it doesn't make sense
for two tasks to share a common linked list).
Since posix_cpu_timers_work never runs, posix_cputimers_work.scheduled is
never cleared. Since scheduled is set, future timer interrupts will skip
scheduling work, with the ultimate result that the task will never receive
timer expirations.
Together, the complete flow is:
1. Task 1 calls clone(), enters kernel.
2. Timer interrupt fires, schedules task work on Task 1.
2a. task_struct.posix_cputimers_work.scheduled = true
2b. task_struct.posix_cputimers_work.work added to
task_struct.task_works.
3. dup_task_struct copies Task 1 to Task 2.
4. copy_process clears task_struct.task_works for Task 2.
5. Future timer interrupts on Task 2 see
task_struct.posix_cputimers_work.scheduled = true and skip scheduling
work.
Fix this by explicitly clearing contents of
task_struct.posix_cputimers_work in copy_process. This was never meant to
be shared or inherited across tasks in the first place.
Signed-off-by: Michael Pratt <mpratt(a)google.com>
Reported-by: Rhys Hiltner <rhys(a)justin.tv>
Fixes: 1fb497dd0030 ("posix-cpu-timers: Provide mechanisms to defer timer handling to task_work")
Cc: <stable(a)vger.kernel.org>
---
This issue was discovered while investigating a flaky test in the Go
language standard library, https://golang.org/issue/49065. After our testing
VMs upgraded from 5.4 to 5.10 kernels, several profiling tests started
failing ~1% of the time with threads not receiving their expected profiling
signals.
Bisection of problem by Rhys blamed b6b178e38f40 ("Merge tag
'timers-core-2020-08-14' of
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip"). This merge commit
introduced the broken commit 1fb497dd0030 ("posix-cpu-timers: Provide
mechanisms to defer timer handling to task_work") and its child
0099808553ad ("x86: Select POSIX_CPU_TIMERS_TASK_WORK"), which enables the
new codepath.
The C program below also reproduces the problem. Build with `gcc repro.c
-lrt -pthread -O2`.
The program starts a CPU timer on the main thread, which then spawns child
threads that create their own CPU timers and verify that they receive timer
signals. At HEAD and 0099808553ad this program fails with ~3-15 / 20000
threads not receiving signals.
Prior to 0099808553ad and with this patch, the program reports no failures.
// SPDX-License-Identifier: GPL-2.0
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>
/* Per-thread count of handled signals; incremented by signal_handler(). */
__thread uint64_t signaled;
/* Number of threads that finished without having counted any signal. */
_Atomic int threads_bad;
/*
 * Signal handler: record one delivery in the calling thread's `signaled`
 * counter. The signal number, siginfo and context arguments are ignored.
 */
void signal_handler(int signo, siginfo_t *siginfo, void *uctx)
{
	(void)signo;
	(void)siginfo;
	(void)uctx;

	signaled += 1;
}
/*
 * Return the kernel thread ID of the caller, obtained through the raw
 * SYS_gettid system call.
 */
int gettid(void)
{
	long tid = syscall(SYS_gettid);

	return (int)tid;
}
/*
 * Create a CLOCK_THREAD_CPUTIME_ID timer that signals the calling thread
 * (SIGEV_THREAD_ID, SIGPROF) and arm it with a 10ms initial expiry and a
 * 10ms interval.
 *
 * A failure of timer_create() or timer_settime() is reported with perror()
 * and terminates the process via _exit(1).
 *
 * Returns the ID of the armed timer.
 */
timer_t setup_timer(void)
{
	timer_t id;
	struct sigevent event = {
		.sigev_notify = SIGEV_THREAD_ID,
		.sigev_signo = SIGPROF,
		._sigev_un = {
			._tid = gettid(),
		},
	};
	struct itimerspec period = {
		.it_value = {
			.tv_nsec = 10*1000*1000, /* 10ms */
		},
		.it_interval = {
			.tv_nsec = 10*1000*1000, /* 10ms */
		},
	};

	if (timer_create(CLOCK_THREAD_CPUTIME_ID, &event, &id) != 0) {
		perror("timer_create");
		_exit(1);
	}

	if (timer_settime(id, 0, &period, NULL) != 0) {
		perror("timer_settime");
		_exit(1);
	}

	return id;
}
/*
 * Return the CPU time consumed so far by the calling thread, in nanoseconds,
 * as read from CLOCK_THREAD_CPUTIME_ID.
 *
 * A clock_gettime() failure is reported with perror() and terminates the
 * process via _exit(1).
 */
uint64_t thread_cpu_ns(void)
{
	struct timespec ts;
	int ret;

	ret = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
	if (ret != 0) {
		perror("clock_gettime");
		_exit(1);
	}

	/*
	 * Widen to 64 bits before multiplying so the seconds-to-nanoseconds
	 * product cannot overflow where time_t/long are only 32 bits wide.
	 */
	return (uint64_t)ts.tv_sec * UINT64_C(1000000000) + (uint64_t)ts.tv_nsec;
}
/*
 * Thread body: arm a per-thread CPU timer, spin until more than 50ms of
 * thread CPU time has been consumed, then verify that this thread counted
 * at least one signal. A thread that saw none is reported on stdout and
 * counted in threads_bad. The timer is deleted before returning; a
 * timer_delete() failure terminates the process via _exit(1).
 *
 * The argument is unused. Always returns NULL.
 */
void *thread(void *arg)
{
	const timer_t timer = setup_timer();
	const uint64_t begin = thread_cpu_ns();

	(void)arg;

	/* Keep spinning until more than 50ms of CPU time has passed. */
	while (thread_cpu_ns() - begin <= 50*1000*1000) {
		/* Busy loop */
		for (volatile int spin = 0; spin < 100000; spin++)
			;
	}

	/*
	 * 50ms passed; we should certainly have received some profiling
	 * signals.
	 */
	if (signaled == 0) {
		printf("Thread %d received no profiling signals!\n", gettid());
		threads_bad++;
	}

	if (timer_delete(timer) != 0) {
		perror("timer_delete");
		_exit(1);
	}

	return NULL;
}
/*
 * Reproducer entry point.
 *
 * Installs signal_handler() for SIGPROF, unblocks SIGPROF, arms a CPU timer
 * on the main thread, and then creates 20000 threads in batches of 10, each
 * running thread(). Progress is printed every 100 threads and a summary of
 * threads that saw no signal is printed at the end.
 *
 * Returns 0 when every thread received a signal, 1 on any setup failure or
 * when at least one thread received none.
 */
int main(void)
{
	struct sigaction sa = {
		.sa_sigaction = &signal_handler,
		.sa_flags = SA_SIGINFO | SA_RESTART,
	};
	sigset_t set;
	int thread_count = 0;
	int bad;
	int ret;

	ret = sigaction(SIGPROF, &sa, NULL);
	if (ret != 0) {
		perror("sigaction");
		return 1;
	}

	sigemptyset(&set);
	sigaddset(&set, SIGPROF);
	ret = sigprocmask(SIG_UNBLOCK, &set, NULL);
	if (ret != 0) {
		perror("sigprocmask");
		return 1;
	}

	/*
	 * Arm a CPU timer on the main thread, which goes on to spawn the child
	 * threads. The timer ID is not needed afterwards; the timer stays
	 * armed until the process exits.
	 */
	setup_timer();

	while (thread_count < 20000) {
		pthread_t threads[10];

		for (int i = 0; i < 10; i++) {
			ret = pthread_create(&threads[i], NULL, &thread, NULL);
			if (ret != 0) {
				/*
				 * pthread functions return the error number
				 * and do not set errno, so perror() would
				 * print an unrelated message here.
				 */
				fprintf(stderr, "pthread_create: %s\n",
					strerror(ret));
				return 1;
			}
			thread_count++;
		}

		/* Busy loop */
		for (volatile int i = 0; i < 100000; i++)
			;

		for (int i = 0; i < 10; i++) {
			ret = pthread_join(threads[i], NULL);
			if (ret != 0) {
				fprintf(stderr, "pthread_join: %s\n",
					strerror(ret));
				return 1;
			}
		}

		if (thread_count % 100 == 0)
			printf("%d threads\n", thread_count);
	}

	/* Read the atomic counter once so the report and exit code agree. */
	bad = threads_bad;
	printf("Bad threads %d / %d = %f%%\n", bad, thread_count,
	       100*((double)bad) / ((double)thread_count));

	if (bad > 0)
		return 1;
	return 0;
}
include/linux/posix-timers.h | 2 ++
kernel/fork.c | 1 +
kernel/time/posix-cpu-timers.c | 19 +++++++++++++++++--
3 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 00fef0064355..5bbcd280bfd2 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -184,8 +184,10 @@ static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
#endif
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+void clear_posix_cputimers_work(struct task_struct *p);
void posix_cputimers_init_work(void);
#else
+static inline void clear_posix_cputimers_work(struct task_struct *p) { }
static inline void posix_cputimers_init_work(void) { }
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 38681ad44c76..b1551c074b74 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2280,6 +2280,7 @@ static __latent_entropy struct task_struct *copy_process(
p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+ clear_posix_cputimers_work(p);
#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 643d412ac623..96b4e7810426 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1158,14 +1158,29 @@ static void posix_cpu_timers_work(struct callback_head *work)
handle_posix_cpu_timers(current);
}
+/*
+ * Clear existing posix CPU timers task work.
+ */
+void clear_posix_cputimers_work(struct task_struct *p)
+{
+ /*
+ * A copied work entry from the old task is not meaningful, clear it.
+ * N.B. init_task_work will not do this.
+ */
+ memset(&p->posix_cputimers_work.work, 0,
+ sizeof(p->posix_cputimers_work.work));
+ init_task_work(&p->posix_cputimers_work.work,
+ posix_cpu_timers_work);
+ p->posix_cputimers_work.scheduled = false;
+}
+
/*
* Initialize posix CPU timers task work in init task. Out of line to
* keep the callback static and to avoid header recursion hell.
*/
void __init posix_cputimers_init_work(void)
{
- init_task_work(&current->posix_cputimers_work.work,
- posix_cpu_timers_work);
+ clear_posix_cputimers_work(current);
}
/*
--
2.33.1.1089.g2158813163f-goog