This patchset is based on Google-internal RSEQ work done by Paul Turner and Andrew Hunter.
When working with per-CPU RSEQ-based memory allocations, it is sometimes important to make sure that a global memory location is no longer accessed from RSEQ critical sections. For example, there can be two per-CPU lists, one "active" and accessed per CPU, while the other is inactive and worked on asynchronously "off CPU" (e.g. garbage-collected). At some point the two lists are swapped, and a fast RCU-like mechanism is required to make sure that the previously active list is no longer accessed.
This patch introduces such a mechanism: in short, the membarrier() syscall issues an IPI to a CPU, restarting any potentially active RSEQ critical section on that CPU.
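To make the intended usage concrete, here is a rough userspace sketch of the list-swap scheme described above (swap_and_quiesce() and struct percpu_list are hypothetical names; this assumes the process has already issued MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, and that readers dereference active_list only inside rseq critical sections):

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	struct percpu_list;			/* hypothetical per-CPU list type */

	static struct percpu_list *active_list;	/* read only inside rseq critical sections */

	static int sys_membarrier(int cmd, unsigned int flags, int cpu_id)
	{
		return syscall(__NR_membarrier, cmd, flags, cpu_id);
	}

	/* Swap in new_list; return the old list once no CPU can still touch it. */
	static struct percpu_list *swap_and_quiesce(struct percpu_list *new_list)
	{
		struct percpu_list *old = __atomic_exchange_n(&active_list, new_list,
							      __ATOMIC_SEQ_CST);

		/*
		 * Restart any rseq critical section currently running in this
		 * process: after this returns, no CPU can still be mid-section
		 * with a stale pointer to 'old', so 'old' can safely be worked
		 * on "off CPU" (e.g. garbage-collected).
		 */
		if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ, 0, 0))
			return NULL;	/* -EPERM: not registered; -EINVAL: no support */
		return old;
	}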
v1->v2:
- removed the ability to IPI all CPUs in a single syscall;
- use task->mm rather than task->group_leader to identify tasks
  belonging to the same process.

v2->v3:
- re-added the ability to IPI all CPUs in a single syscall;
- integrated with membarrier_private_expedited() to make sure only CPUs
  running tasks with the same mm as the current task are interrupted;
- also added MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ;
- flags in membarrier_private_expedited are never actually bit flags
  but always distinct values (i.e. no two flags are ever combined), so
  I changed the bit tests to full equality comparisons for simplicity
  (otherwise the code would have to handle several bits being set at
  once, for example).

v3->v4:
- added a third parameter to the membarrier syscall, @cpu_id: if
  @flags == MEMBARRIER_CMD_FLAG_CPU, then @cpu_id indicates the CPU on
  which the RSEQ CS should be restarted.

v4->v5:
- added the @cpu_id parameter to sys_membarrier in syscalls.h.

v5->v6:
- made membarrier_private_expedited more efficient in the single-CPU case;
- a couple of minor refactorings.

v6->v7:
- made @flags an unsigned int in sys_membarrier;
- a couple of minor refactorings.

v7->v8:
- replaced BUG_ON with WARN_ON_ONCE in membarrier.c.
The second patch in the patchset adds a selftest of this feature.
Signed-off-by: Peter Oskolkov <posk@google.com>
---
 include/linux/sched/mm.h        |   3 +
 include/linux/syscalls.h        |   2 +-
 include/uapi/linux/membarrier.h |  26 ++++++
 kernel/sched/membarrier.c       | 136 +++++++++++++++++++++++++-------
 4 files changed, 136 insertions(+), 31 deletions(-)
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index f889e332912f..15bfb06f2884 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -348,10 +348,13 @@ enum {
 	MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3),
 	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4),
 	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6),
+	MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7),
 };

 enum {
 	MEMBARRIER_FLAG_SYNC_CORE = (1U << 0),
+	MEMBARRIER_FLAG_RSEQ = (1U << 1),
 };

 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 75ac7f8ae93c..466c993e52bf 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
 			const char __user *const __user *argv,
 			const char __user *const __user *envp, int flags);
 asmlinkage long sys_userfaultfd(int flags);
-asmlinkage long sys_membarrier(int cmd, int flags);
+asmlinkage long sys_membarrier(int cmd, int flags, int cpu_id);
 asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
 asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
 				    int fd_out, loff_t __user *off_out,
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index 5891d7614c8c..737605897f36 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -114,6 +114,26 @@
  *                          If this command is not implemented by an
  *                          architecture, -EINVAL is returned.
  *                          Returns 0 on success.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ *                          Ensure the caller thread, upon return from
+ *                          system call, that all its running thread
+ *                          siblings have any currently running rseq
+ *                          critical sections restarted if @flags
+ *                          parameter is 0; if @flags parameter is
+ *                          MEMBARRIER_CMD_FLAG_CPU,
+ *                          then this operation is performed only
+ *                          on CPU indicated by @cpu_id. If this command is
+ *                          not implemented by an architecture, -EINVAL
+ *                          is returned. A process needs to register its
+ *                          intent to use the private expedited rseq
+ *                          command prior to using it, otherwise
+ *                          this command returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ *                          Register the process intent to use
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ.
+ *                          If this command is not implemented by an
+ *                          architecture, -EINVAL is returned.
+ *                          Returns 0 on success.
  * @MEMBARRIER_CMD_SHARED:
  *                          Alias to MEMBARRIER_CMD_GLOBAL. Provided for
  *                          header backward compatibility.
@@ -131,9 +151,15 @@ enum membarrier_cmd {
 	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
 	MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5),
 	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6),
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = (1 << 7),
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8),

 	/* Alias for header backward compatibility. */
 	MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL,
 };

+enum membarrier_cmd_flag {
+	MEMBARRIER_CMD_FLAG_CPU = (1 << 0),
+};
+
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 168479a7d61b..e23e74d52db5 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,6 +18,14 @@
 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
 #endif

+#ifdef CONFIG_RSEQ
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
+#else
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
+#endif
+
 #define MEMBARRIER_CMD_BITMASK \
 	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
 	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
@@ -30,6 +38,11 @@ static void ipi_mb(void *info)
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }

+static void ipi_rseq(void *info)
+{
+	rseq_preempt(current);
+}
+
 static void ipi_sync_rq_state(void *info)
 {
 	struct mm_struct *mm = (struct mm_struct *) info;
@@ -129,19 +142,27 @@ static int membarrier_global_expedited(void)
 	return 0;
 }

-static int membarrier_private_expedited(int flags)
+static int membarrier_private_expedited(int flags, int cpu_id)
 {
-	int cpu;
 	cpumask_var_t tmpmask;
 	struct mm_struct *mm = current->mm;
+	smp_call_func_t ipi_func = ipi_mb;

-	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
 		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 			return -EPERM;
+	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
+		if (!IS_ENABLED(CONFIG_RSEQ))
+			return -EINVAL;
+		if (!(atomic_read(&mm->membarrier_state) &
+		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
+			return -EPERM;
+		ipi_func = ipi_rseq;
 	} else {
+		WARN_ON_ONCE(flags);
 		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
 			return -EPERM;
@@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags)
 	 */
 	smp_mb();	/* system call entry is not a mb. */

-	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 		return -ENOMEM;

 	cpus_read_lock();
-	rcu_read_lock();
-	for_each_online_cpu(cpu) {
+
+	if (cpu_id >= 0) {
 		struct task_struct *p;

-		/*
-		 * Skipping the current CPU is OK even through we can be
-		 * migrated at any point. The current CPU, at the point
-		 * where we read raw_smp_processor_id(), is ensured to
-		 * be in program order with respect to the caller
-		 * thread. Therefore, we can skip this CPU from the
-		 * iteration.
-		 */
-		if (cpu == raw_smp_processor_id())
-			continue;
-		p = rcu_dereference(cpu_rq(cpu)->curr);
-		if (p && p->mm == mm)
-			__cpumask_set_cpu(cpu, tmpmask);
+		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+			goto out;
+		if (cpu_id == raw_smp_processor_id())
+			goto out;
+		rcu_read_lock();
+		p = rcu_dereference(cpu_rq(cpu_id)->curr);
+		if (!p || p->mm != mm) {
+			rcu_read_unlock();
+			goto out;
+		}
+		rcu_read_unlock();
+	} else {
+		int cpu;
+
+		rcu_read_lock();
+		for_each_online_cpu(cpu) {
+			struct task_struct *p;
+
+			/*
+			 * Skipping the current CPU is OK even through we can be
+			 * migrated at any point. The current CPU, at the point
+			 * where we read raw_smp_processor_id(), is ensured to
+			 * be in program order with respect to the caller
+			 * thread. Therefore, we can skip this CPU from the
+			 * iteration.
+			 */
+			if (cpu == raw_smp_processor_id())
+				continue;
+			p = rcu_dereference(cpu_rq(cpu)->curr);
+			if (p && p->mm == mm)
+				__cpumask_set_cpu(cpu, tmpmask);
+		}
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();

 	preempt_disable();
-	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+	if (cpu_id >= 0)
+		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
+	else
+		smp_call_function_many(tmpmask, ipi_func, NULL, 1);
 	preempt_enable();

-	free_cpumask_var(tmpmask);
+out:
+	if (cpu_id < 0)
+		free_cpumask_var(tmpmask);
 	cpus_read_unlock();

 	/*
@@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags)
 	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
 	    ret;

-	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
 		ready_state =
 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
+		if (!IS_ENABLED(CONFIG_RSEQ))
+			return -EINVAL;
+		ready_state =
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
+	} else {
+		WARN_ON_ONCE(flags);
 	}

 	/*
@@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags)
 		return 0;
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
 		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+	if (flags & MEMBARRIER_FLAG_RSEQ)
+		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
 	atomic_or(set_state, &mm->membarrier_state);
 	ret = sync_runqueues_membarrier_state(mm);
 	if (ret)
@@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags)

 /**
  * sys_membarrier - issue memory barriers on a set of threads
- * @cmd:   Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
+ * @cmd:    Takes command values defined in enum membarrier_cmd.
+ * @flags:  Currently needs to be 0 for all commands other than
+ *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
+ *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
+ *          contains the CPU on which to interrupt (= restart)
+ *          the RSEQ critical section.
+ * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
+ *          RSEQ CS should be interrupted (@cmd must be
+ *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
  *
  * If this system call is not implemented, -ENOSYS is returned. If the
  * command specified does not exist, not available on the running
@@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags)
  *        smp_mb()           X           O            O
  *        sys_membarrier()   O           O            O
  */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
 {
-	if (unlikely(flags))
-		return -EINVAL;
+	switch (cmd) {
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
+			return -EINVAL;
+		break;
+	default:
+		if (unlikely(flags))
+			return -EINVAL;
+	}
+
+	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
+		cpu_id = -1;
+
 	switch (cmd) {
 	case MEMBARRIER_CMD_QUERY:
 	{
@@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
 		return membarrier_register_global_expedited();
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-		return membarrier_private_expedited(0);
+		return membarrier_private_expedited(0, cpu_id);
 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
 		return membarrier_register_private_expedited(0);
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
-		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
 		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
 	default:
 		return -EINVAL;
 	}
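A usage note, not part of the patch: with the new third argument, a caller that only needs to quiesce one CPU can pass MEMBARRIER_CMD_FLAG_CPU instead of IPIing every CPU running the process. A minimal sketch (restart_rseq_cs_on_cpu() is a hypothetical helper; the ENXIO handling mirrors what the selftest in the last patch of this series does):

	#include <errno.h>
	#include <linux/membarrier.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/*
	 * Sketch: restart any rseq critical section currently running on
	 * 'cpu' in this process. Assumes the process has already issued
	 * MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ.
	 */
	static void restart_rseq_cs_on_cpu(int cpu)
	{
		if (syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
			    MEMBARRIER_CMD_FLAG_CPU, cpu) && errno != ENXIO) {
			perror("membarrier");
			abort();
		}
	}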
This patch adds the rseq_offset_deref_addv() function to tools/testing/selftests/rseq/rseq-x86.h, to be used in a selftest in the next patch of the patchset.
v7->v8: this patch was split out of the v7 selftest patch.
Signed-off-by: Peter Oskolkov <posk@google.com>
---
 tools/testing/selftests/rseq/rseq-x86.h | 57 +++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
diff --git a/tools/testing/selftests/rseq/rseq-x86.h b/tools/testing/selftests/rseq/rseq-x86.h
index b2da6004fe30..640411518e46 100644
--- a/tools/testing/selftests/rseq/rseq-x86.h
+++ b/tools/testing/selftests/rseq/rseq-x86.h
@@ -279,6 +279,63 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 #endif
 }

+#define RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+
+/*
+ *   pval = *(ptr+off)
+ *  *pval += inc;
+ */
+static inline __attribute__((always_inline))
+int rseq_offset_deref_addv(intptr_t *ptr, off_t off, intptr_t inc, int cpu)
+{
+	RSEQ_INJECT_C(9)
+
+	__asm__ __volatile__ goto (
+		RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+		/* Start rseq by storing table entry pointer into rseq_cs. */
+		RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
+		RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
+#endif
+		/* get p+v */
+		"movq %[ptr], %%rbx\n\t"
+		"addq %[off], %%rbx\n\t"
+		/* get pv */
+		"movq (%%rbx), %%rcx\n\t"
+		/* *pv += inc */
+		"addq %[inc], (%%rcx)\n\t"
+		"2:\n\t"
+		RSEQ_INJECT_ASM(4)
+		RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+		: /* gcc asm goto does not allow outputs */
+		: [cpu_id]	"r" (cpu),
+		  [rseq_abi]	"r" (&__rseq_abi),
+		  /* final store input */
+		  [ptr]		"m" (*ptr),
+		  [off]		"er" (off),
+		  [inc]		"er" (inc)
+		: "memory", "cc", "rax", "rbx", "rcx"
+		  RSEQ_INJECT_CLOBBER
+		: abort
+#ifdef RSEQ_COMPARE_TWICE
+		  , error1
+#endif
+	);
+	return 0;
+abort:
+	RSEQ_INJECT_FAILED
+	return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+	rseq_bug("cpu_id comparison failed");
+#endif
+}
+
 static inline __attribute__((always_inline))
 int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 				 intptr_t *v2, intptr_t newv2,
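For context, the intended calling pattern is a retry loop keyed on the current CPU, as the selftest in the next patch does. A sketch (percpu_slot_inc() is a hypothetical wrapper; struct percpu_list_entry comes from the selftest, and list_ptr is the intptr_t slot holding the currently active per-CPU array):

	/*
	 * Sketch: increment the current CPU's slot of whatever per-CPU
	 * array *list_ptr points to right now. The pointer is dereferenced
	 * inside the rseq critical section itself, which is what makes the
	 * list-swap + MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ scheme safe.
	 */
	static inline void percpu_slot_inc(intptr_t *list_ptr)
	{
		int ret;

		do {
			int cpu = rseq_cpu_start();

			ret = rseq_offset_deref_addv(list_ptr,
					cpu * sizeof(struct percpu_list_entry), 1, cpu);
		} while (rseq_unlikely(ret));
	}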
----- On Sep 23, 2020, at 7:36 PM, Peter Oskolkov <posk@google.com> wrote:
The patch title should state that it only adds rseq_offset_deref_addv to x86-64. Considering that other architecture maintainers will look at it as inspiration for other architectures, we should also state in the commit message that architectures implementing it should define "RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV".
Thanks,
Mathieu
----- On Sep 24, 2020, at 9:33 AM, Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
----- On Sep 23, 2020, at 7:36 PM, Peter Oskolkov <posk@google.com> wrote:
The patch title should state that it only adds rseq_offset_deref_addv to x86-64. Considering that other architecture maintainers will look at it as inspiration for other architectures, we should also state in the commit message that architectures implementing it should define "RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV".
With those changes applied, you can also add my:
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
Based on Google-internal RSEQ work done by Paul Turner and Andrew Hunter.
This patch adds a selftest for MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ. The test quite often fails without the previous patch in this patchset, but consistently passes with it.
v3: added rseq_offset_deref_addv() to x86_64 to make the test more
    explicit; on other architectures I kept using the existing
    rseq_cmpeqv_cmpeqv_storev() as I have no easy way to test there.
    Added a comment explaining why the test works this way.
v4: skipped the test if rseq_offset_deref_addv() is not present (that
    is, on all architectures other than x86_64).
v8: split rseq_offset_deref_addv() into a separate patch; moved the
    test to param_test; other minor tweaks.
Signed-off-by: Peter Oskolkov <posk@google.com>
---
 tools/testing/selftests/rseq/param_test.c      | 223 +++++++++++++++++-
 .../testing/selftests/rseq/run_param_test.sh   |   2 +
 2 files changed, 224 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
index e8a657a5f48a..384589095864 100644
--- a/tools/testing/selftests/rseq/param_test.c
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: LGPL-2.1
 #define _GNU_SOURCE
 #include <assert.h>
+#include <linux/membarrier.h>
 #include <pthread.h>
 #include <sched.h>
+#include <stdatomic.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -1131,6 +1133,220 @@ static int set_signal_handler(void)
 	return ret;
 }

+struct test_membarrier_thread_args {
+	int stop;
+	intptr_t percpu_list_ptr;
+};
+
+/* Worker threads modify data in their "active" percpu lists. */
+void *test_membarrier_worker_thread(void *arg)
+{
+	struct test_membarrier_thread_args *args =
+		(struct test_membarrier_thread_args *)arg;
+	const int iters = opt_reps;
+	int i;
+
+	if (rseq_register_current_thread()) {
+		fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		abort();
+	}
+
+	/* Wait for initialization. */
+	while (!atomic_load(&args->percpu_list_ptr)) {}
+
+	for (i = 0; i < iters; ++i) {
+		int ret;
+
+		do {
+			int cpu = rseq_cpu_start();
+
+			ret = rseq_offset_deref_addv(&args->percpu_list_ptr,
+				sizeof(struct percpu_list_entry) * cpu, 1, cpu);
+		} while (rseq_unlikely(ret));
+	}
+
+	if (rseq_unregister_current_thread()) {
+		fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		abort();
+	}
+	return NULL;
+}
+
+void test_membarrier_init_percpu_list(struct percpu_list *list)
+{
+	int i;
+
+	memset(list, 0, sizeof(*list));
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_list_node *node;
+
+		node = malloc(sizeof(*node));
+		assert(node);
+		node->data = 0;
+		node->next = NULL;
+		list->c[i].head = node;
+	}
+}
+
+void test_membarrier_free_percpu_list(struct percpu_list *list)
+{
+	int i;
+
+	for (i = 0; i < CPU_SETSIZE; i++)
+		free(list->c[i].head);
+}
+
+static int sys_membarrier(int cmd, int flags, int cpu_id)
+{
+	return syscall(__NR_membarrier, cmd, flags, cpu_id);
+}
+
+/*
+ * The manager thread swaps per-cpu lists that worker threads see,
+ * and validates that there are no unexpected modifications.
+ */
+void *test_membarrier_manager_thread(void *arg)
+{
+	struct test_membarrier_thread_args *args =
+		(struct test_membarrier_thread_args *)arg;
+	struct percpu_list list_a, list_b;
+	intptr_t expect_a = 0, expect_b = 0;
+	int cpu_a = 0, cpu_b = 0;
+
+	if (rseq_register_current_thread()) {
+		fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		abort();
+	}
+
+	/* Init lists. */
+	test_membarrier_init_percpu_list(&list_a);
+	test_membarrier_init_percpu_list(&list_b);
+
+	atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
+
+	while (!atomic_load(&args->stop)) {
+		/* list_a is "active". */
+		cpu_a = rand() % CPU_SETSIZE;
+		/*
+		 * As list_b is "inactive", we should never see changes
+		 * to list_b.
+		 */
+		if (expect_b != atomic_load(&list_b.c[cpu_b].head->data)) {
+			fprintf(stderr, "Membarrier test failed\n");
+			abort();
+		}
+
+		/* Make list_b "active". */
+		atomic_store(&args->percpu_list_ptr, (intptr_t)&list_b);
+		if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+					MEMBARRIER_CMD_FLAG_CPU, cpu_a) &&
+				errno != ENXIO /* missing CPU */) {
+			perror("sys_membarrier");
+			abort();
+		}
+		/*
+		 * Cpu A should now only modify list_b, so the values
+		 * in list_a should be stable.
+		 */
+		expect_a = atomic_load(&list_a.c[cpu_a].head->data);
+
+		cpu_b = rand() % CPU_SETSIZE;
+		/*
+		 * As list_a is "inactive", we should never see changes
+		 * to list_a.
+		 */
+		if (expect_a != atomic_load(&list_a.c[cpu_a].head->data)) {
+			fprintf(stderr, "Membarrier test failed\n");
+			abort();
+		}
+
+		/* Make list_a "active". */
+		atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
+		if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+					MEMBARRIER_CMD_FLAG_CPU, cpu_b) &&
+				errno != ENXIO /* missing CPU */) {
+			perror("sys_membarrier");
+			abort();
+		}
+		/* Remember a value from list_b. */
+		expect_b = atomic_load(&list_b.c[cpu_b].head->data);
+	}
+
+	test_membarrier_free_percpu_list(&list_a);
+	test_membarrier_free_percpu_list(&list_b);
+
+	if (rseq_unregister_current_thread()) {
+		fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		abort();
+	}
+	return NULL;
+}
+
+/* Test MEMBARRIER_CMD_PRIVATE_RESTART_RSEQ_ON_CPU membarrier command. */
+#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+void test_membarrier(void)
+{
+	const int num_threads = opt_threads;
+	struct test_membarrier_thread_args thread_args;
+	pthread_t worker_threads[num_threads];
+	pthread_t manager_thread;
+	int i, ret;
+
+	if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0)) {
+		perror("sys_membarrier");
+		abort();
+	}
+
+	thread_args.stop = 0;
+	thread_args.percpu_list_ptr = 0;
+	ret = pthread_create(&manager_thread, NULL,
+			test_membarrier_manager_thread, &thread_args);
+	if (ret) {
+		errno = ret;
+		perror("pthread_create");
+		abort();
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&worker_threads[i], NULL,
+				test_membarrier_worker_thread, &thread_args);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(worker_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	atomic_store(&thread_args.stop, 1);
+	ret = pthread_join(manager_thread, NULL);
+	if (ret) {
+		errno = ret;
+		perror("pthread_join");
+		abort();
+	}
+}
+#else /* RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV */
+void test_membarrier(void)
+{
+	fprintf(stderr, "rseq_offset_deref_addv is not implemented on this architecture. "
+			"Skipping membarrier test.\n");
+}
+#endif
+
 static void show_usage(int argc, char **argv)
 {
 	printf("Usage : %s <OPTIONS>\n",
@@ -1153,7 +1369,7 @@ static void show_usage(int argc, char **argv)
 	printf("	[-r N] Number of repetitions per thread (default 5000)\n");
 	printf("	[-d] Disable rseq system call (no initialization)\n");
 	printf("	[-D M] Disable rseq for each M threads\n");
-	printf("	[-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
+	printf("	[-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement, membarrie(r)\n");
 	printf("	[-M] Push into buffer and memcpy buffer with memory barriers.\n");
 	printf("	[-v] Verbose output.\n");
 	printf("	[-h] Show this help.\n");
@@ -1268,6 +1484,7 @@ int main(int argc, char **argv)
 		case 'i':
 		case 'b':
 		case 'm':
+		case 'r':
 			break;
 		default:
 			show_usage(argc, argv);
@@ -1320,6 +1537,10 @@ int main(int argc, char **argv)
 		printf_verbose("counter increment\n");
 		test_percpu_inc();
 		break;
+	case 'r':
+		printf_verbose("membarrier\n");
+		test_membarrier();
+		break;
 	}
 	if (!opt_disable_rseq && rseq_unregister_current_thread())
 		abort();
diff --git a/tools/testing/selftests/rseq/run_param_test.sh b/tools/testing/selftests/rseq/run_param_test.sh
index e426304fd4a0..f51bc83c9e41 100755
--- a/tools/testing/selftests/rseq/run_param_test.sh
+++ b/tools/testing/selftests/rseq/run_param_test.sh
@@ -15,6 +15,7 @@ TEST_LIST=(
 	"-T m"
 	"-T m -M"
 	"-T i"
+	"-T r"
 )

 TEST_NAME=(
@@ -25,6 +26,7 @@ TEST_NAME=(
 	"memcpy"
 	"memcpy with barrier"
 	"increment"
+	"membarrier"
 )
 IFS="$OLDIFS"
----- On Sep 23, 2020, at 7:36 PM, Peter Oskolkov <posk@google.com> wrote:
Based on Google-internal RSEQ work done by Paul Turner and Andrew Hunter.
This patch adds a selftest for MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ. The test quite often fails without the previous patch in this patchset, but consistently passes with it.
Signed-off-by: Peter Oskolkov <posk@google.com>

Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Thanks!
Mathieu
----- On Sep 23, 2020, at 7:36 PM, Peter Oskolkov <posk@google.com> wrote:
This patchset is based on Google-internal RSEQ work done by Paul Turner and Andrew Hunter.
When working with per-CPU RSEQ-based memory allocations, it is sometimes important to make sure that a global memory location is no longer accessed from RSEQ critical sections. For example, there can be two per-CPU lists, one is "active" and accessed per-CPU, while another one is inactive and worked on asynchronously "off CPU" (e.g. garbage collection is performed). Then at some point the two lists are swapped, and a fast RCU-like mechanism is required to make sure that the previously active list is no longer accessed.
This patch introduces such a mechanism: in short, membarrier() syscall issues an IPI to a CPU, restarting a potentially active RSEQ critical section on the CPU.
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
For next time: the changelog (vN->vN+1) should go below a "---" line, which comes after all the Signed-off-by, Acked-by and other tags.
Thanks,
Mathieu
On Thu, Sep 24, 2020 at 09:51:43AM -0400, Mathieu Desnoyers wrote:
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Thanks! I've queued them in:
git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/core
please double check the Subject/Changelog edits I made. Once all the robots are green, I'll push out the lot to -tip.
On Wed, Sep 23, 2020 at 04:36:16PM -0700, Peter Oskolkov wrote:
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
 			const char __user *const __user *argv,
 			const char __user *const __user *envp, int flags);
 asmlinkage long sys_userfaultfd(int flags);
-asmlinkage long sys_membarrier(int cmd, int flags);
+asmlinkage long sys_membarrier(int cmd, int flags, int cpu_id);
 asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
 asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
 				    int fd_out, loff_t __user *off_out,
The below is required to make arm build... I'll update the patch and push out again.
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 466c993e52bf..06db09875aa4 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
 			const char __user *const __user *argv,
 			const char __user *const __user *envp, int flags);
 asmlinkage long sys_userfaultfd(int flags);
-asmlinkage long sys_membarrier(int cmd, int flags, int cpu_id);
+asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id);
 asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
 asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
 				    int fd_out, loff_t __user *off_out,
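The reason, as a general note rather than something stated in the thread: on architectures that do not use syscall wrappers (arm among them), SYSCALL_DEFINE3() itself emits a prototype for sys_membarrier() with the definition's exact parameter types, so the declaration in include/linux/syscalls.h has to match it:

	/* Roughly what SYSCALL_DEFINE3(membarrier, ...) emits on such architectures: */
	asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id);

	/*
	 * If syscalls.h still declares sys_membarrier(int, int, int), the
	 * compiler reports "conflicting types for 'sys_membarrier'" and the
	 * build breaks.
	 */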