Add the perf_event_attr::remove_on_exec bit, to support removing an event from a task on exec.
This option supports the case where an event is supposed to be process-wide only and should not propagate beyond exec, limiting monitoring to the original process image.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
---
v3:
* Rework based on Peter's "perf: Rework perf_event_exit_event()" added to the beginning of the series. Intermediate attempts between v2 and this v3 can be found here: https://lkml.kernel.org/r/YFm6aakSRlF2nWtu@elver.google.com
v2: * Add patch to series. --- include/uapi/linux/perf_event.h | 3 +- kernel/events/core.c | 70 +++++++++++++++++++++++++++++---- 2 files changed, 64 insertions(+), 9 deletions(-)
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 813efb65fea8..8c5b9f5ad63f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -390,7 +390,8 @@ struct perf_event_attr { text_poke : 1, /* include text poke events */ build_id : 1, /* use build id in mmap2 events */ inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ - __reserved_1 : 28; + remove_on_exec : 1, /* event is removed from task on exec */ + __reserved_1 : 27;
union { __u32 wakeup_events; /* wakeup every n events */ diff --git a/kernel/events/core.c b/kernel/events/core.c index de2917b3c59e..19c045ff2b9c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4247,6 +4247,57 @@ static void perf_event_enable_on_exec(int ctxn) put_ctx(clone_ctx); }
+static void perf_remove_from_owner(struct perf_event *event); +static void perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx); + +/* + * Removes all events from the current task that have been marked + * remove-on-exec, and feeds their values back to parent events. + */ +static void perf_event_remove_on_exec(int ctxn) +{ + struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event *event, *next; + LIST_HEAD(free_list); + unsigned long flags; + bool modified = false; + + ctx = perf_pin_task_context(current, ctxn); + if (!ctx) + return; + + mutex_lock(&ctx->mutex); + + if (WARN_ON_ONCE(ctx->task != current)) + goto unlock; + + list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { + if (!event->attr.remove_on_exec) + continue; + + if (!is_kernel_event(event)) + perf_remove_from_owner(event); + + modified = true; + + perf_event_exit_event(event, ctx); + } + + raw_spin_lock_irqsave(&ctx->lock, flags); + if (modified) + clone_ctx = unclone_ctx(ctx); + --ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + +unlock: + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); + if (clone_ctx) + put_ctx(clone_ctx); +} + struct perf_read_data { struct perf_event *event; bool group; @@ -7559,18 +7610,18 @@ void perf_event_exec(void) struct perf_event_context *ctx; int ctxn;
- rcu_read_lock(); for_each_task_context_nr(ctxn) { - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - perf_event_enable_on_exec(ctxn); + perf_event_remove_on_exec(ctxn);
- perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, - true); + rcu_read_lock(); + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) { + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, + NULL, true); + } + rcu_read_unlock(); } - rcu_read_unlock(); }
struct remote_output { @@ -11652,6 +11703,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (!attr->inherit && attr->inherit_thread) return -EINVAL;
+ if (attr->remove_on_exec && attr->enable_on_exec) + return -EINVAL; + out: return ret;