Dear friends,
this patch series adds support for nested seccomp listeners. It allows container runtimes and other sandboxing software to install seccomp listeners on top of existing ones, which is useful for nested LXC containers and other similar use-cases.
I decided to go with a conservative approach and limit the maximum number of nested listeners to 8 per seccomp filter chain (MAX_LISTENERS_PER_PATH). This is done to avoid dynamic memory allocations in the very hot __seccomp_filter() function, where we use a preallocated fixed-size array on the stack to track matched listeners. 8 nested listeners should be enough for almost any practical scenario.
Since I expect this patch series to generate some discussion, I'm going to present a talk at LPC 2025 about the design and implementation details of this feature [1].
Git tree (based on for-next/seccomp): v1: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners.v1 current: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners
Link: https://lpc.events/event/19/contributions/2241/ [1]
Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Cc: bpf@vger.kernel.org Cc: Kees Cook kees@kernel.org Cc: Andy Lutomirski luto@amacapital.net Cc: Will Drewry wad@chromium.org Cc: Jonathan Corbet corbet@lwn.net Cc: Shuah Khan shuah@kernel.org Cc: Tycho Andersen tycho@tycho.pizza Cc: Andrei Vagin avagin@gmail.com Cc: Christian Brauner brauner@kernel.org Cc: Stéphane Graber stgraber@stgraber.org
Alexander Mikhalitsyn (6): seccomp: remove unused argument from seccomp_do_user_notification seccomp: prepare seccomp_run_filters() to support more than one listener seccomp: limit number of listeners in seccomp tree seccomp: handle multiple listeners case seccomp: relax has_duplicate_listeners check tools/testing/selftests/seccomp: test nested listeners
.../userspace-api/seccomp_filter.rst | 6 + include/linux/seccomp.h | 3 +- include/uapi/linux/seccomp.h | 13 +- kernel/seccomp.c | 99 +++++++++-- tools/include/uapi/linux/seccomp.h | 13 +- tools/testing/selftests/seccomp/seccomp_bpf.c | 162 ++++++++++++++++++ 6 files changed, 269 insertions(+), 27 deletions(-)
Add some basic tests for nested listeners.
Cc: linux-kernel@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Cc: bpf@vger.kernel.org Cc: Kees Cook kees@kernel.org Cc: Andy Lutomirski luto@amacapital.net Cc: Will Drewry wad@chromium.org Cc: Jonathan Corbet corbet@lwn.net Cc: Shuah Khan shuah@kernel.org Cc: Tycho Andersen tycho@tycho.pizza Cc: Andrei Vagin avagin@gmail.com Cc: Christian Brauner brauner@kernel.org Cc: Stéphane Graber stgraber@stgraber.org Signed-off-by: Alexander Mikhalitsyn aleksandr.mikhalitsyn@canonical.com --- tools/testing/selftests/seccomp/seccomp_bpf.c | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index fc4910d35342..0bf02d04fe15 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -293,6 +293,10 @@ struct seccomp_notif_addfd_big { #define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5) #endif
+#ifndef SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS +#define SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS (1UL << 6) +#endif + #ifndef seccomp int seccomp(unsigned int op, unsigned int flags, void *args) { @@ -4408,6 +4412,164 @@ TEST(user_notification_sync) ASSERT_EQ(status, 0); }
+/* Mirrors MAX_LISTENERS_PER_PATH from kernel/seccomp.c: the maximum number of nested listeners per seccomp filter chain. */ +#define MAX_LISTENERS_PER_PATH 8 + +TEST(user_notification_nested_limits) +{ + pid_t pid; + long ret; + int i, status, listeners[MAX_LISTENERS_PER_PATH]; + + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + /* Install 6 levels of listeners, each one allowing further nesting. */ + for (i = 0; i < 6; i++) { + listeners[i] = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS); + ASSERT_GE(listeners[i], 0); + + /* Add some no-op filters for grins. */ + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + } + + /* Check behavior when nesting is not allowed (done in a forked child; the result is reported via its exit status). */ + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) { + /* Install the next listener in the chain without the allow-nesting flag. */ + listeners[6] = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + if (listeners[6] < 0) + exit(1); + + /* Add some no-op filters for grins. */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog); + if (ret != 0) + exit(2); + + ret = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + /* Installing one more listener in the chain is expected to fail with EBUSY; exit code 3 signals any other outcome. */ + exit((ret >= 0 || errno != EBUSY) ? 3 : 0); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + /* Install more filters with listeners until MAX_LISTENERS_PER_PATH levels are in place. */ + for (; i < MAX_LISTENERS_PER_PATH; i++) { + listeners[i] = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS); + ASSERT_GE(listeners[i], 0); + + /* Add some no-op filters for grins. 
*/ + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + } + + /* Installing one more listener past MAX_LISTENERS_PER_PATH is expected to fail with ELOOP. */ + EXPECT_EQ(user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER), + -1); + EXPECT_EQ(errno, ELOOP); +} + +TEST(user_notification_nested) +{ + pid_t pid; + long ret; + int i, status, listeners[6]; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + /* Install 6 levels of listeners, each one allowing further nesting. */ + for (i = 0; i < 6; i++) { + listeners[i] = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS); + ASSERT_GE(listeners[i], 0); + + /* Add some no-op filters for grins. */ + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + } + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ret = syscall(__NR_getppid); + exit(ret != (USER_NOTIF_MAGIC-3)); + } + + /* + * We want to have the following picture: + * + * | Listener level (i) | Listener decision | + * |--------------------|-------------------| + * | 0 | WHATEVER | + * | 1 | WHATEVER | + * | 2 | WHATEVER | + * | 3 | RETURN | <-- stop here + * | 4 | CONTINUE SYSCALL | + * | 5 | CONTINUE SYSCALL | <- start here (current->seccomp.filter) + * + * The first listener to receive a notification is level 5, then 4, + * then we expect to stop on level 3 and return from syscall with + * (USER_NOTIF_MAGIC - 3) return value. 
+ */ + for (i = 6 - 1; i >= 3; i--) { + memset(&req, 0, sizeof(req)); + EXPECT_EQ(ioctl(listeners[i], SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + EXPECT_EQ(req.pid, pid); + EXPECT_EQ(req.data.nr, __NR_getppid); + + memset(&resp, 0, sizeof(resp)); + resp.id = req.id; + + if (i == 5 || i == 4) { + resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE; + } else { + resp.error = 0; + resp.val = USER_NOTIF_MAGIC - i; + } + + EXPECT_EQ(ioctl(listeners[i], SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + for (i = 0; i < 6; i++) + close(listeners[i]); +}
/* Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN. */ FIXTURE(O_SUSPEND_SECCOMP) {
linux-kselftest-mirror@lists.linaro.org