Dear friends,
this patch series adds support for nested seccomp listeners. It allows container runtimes and other sandboxing software to install seccomp listeners on top of existing ones, which is useful for nested LXC containers and other similar use-cases.
Since I expect this patch series to prompt some discussion, I am going to give a talk at LPC 2025 about the design and implementation details of this feature [1].
Git tree (based on for-next/seccomp): v3: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners.v3 current: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners
Changelog for version 3: - almost completely rewritten (no static array on the stack, no nesting limit) - more testcases
Changelog for version 2: - add some explanatory comments - add Reviewed-by tags from Tycho Andersen (thanks, Tycho! ;) ) - CC-ed Aleksa as he might be interested in this stuff too
Links to previous versions: v2: https://lore.kernel.org/all/20251202115200.110646-1-aleksandr.mikhalitsyn@ca... tree: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners.v2 v1: https://lore.kernel.org/all/20251201122406.105045-1-aleksandr.mikhalitsyn@ca... tree: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners.v1
Link: https://lpc.events/event/19/contributions/2241/ [1]
Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Cc: bpf@vger.kernel.org Cc: Kees Cook kees@kernel.org Cc: Andy Lutomirski luto@amacapital.net Cc: Will Drewry wad@chromium.org Cc: Jonathan Corbet corbet@lwn.net Cc: Shuah Khan shuah@kernel.org Cc: Aleksa Sarai cyphar@cyphar.com Cc: Tycho Andersen tycho@tycho.pizza Cc: Andrei Vagin avagin@gmail.com Cc: Christian Brauner brauner@kernel.org Cc: Stéphane Graber stgraber@stgraber.org
Alexander Mikhalitsyn (7): seccomp: remove unused argument from seccomp_do_user_notification seccomp: use bitfields for boolean flags on seccomp_filter struct seccomp: keep track of seccomp filters with closed listeners seccomp: mark first listener in the tree seccomp: handle multiple listeners case seccomp: allow nested listeners tools/testing/selftests/seccomp: test nested listeners
.../userspace-api/seccomp_filter.rst | 6 + include/linux/seccomp.h | 3 +- include/uapi/linux/seccomp.h | 13 +- kernel/seccomp.c | 129 +++++++- tools/include/uapi/linux/seccomp.h | 13 +- tools/testing/selftests/seccomp/seccomp_bpf.c | 303 ++++++++++++++++++ 6 files changed, 438 insertions(+), 29 deletions(-)
Add some basic tests for nested listeners.
Cc: linux-kernel@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Cc: bpf@vger.kernel.org Cc: Kees Cook kees@kernel.org Cc: Andy Lutomirski luto@amacapital.net Cc: Will Drewry wad@chromium.org Cc: Jonathan Corbet corbet@lwn.net Cc: Shuah Khan shuah@kernel.org Cc: Aleksa Sarai cyphar@cyphar.com Cc: Tycho Andersen tycho@tycho.pizza Cc: Andrei Vagin avagin@gmail.com Cc: Christian Brauner brauner@kernel.org Cc: Stéphane Graber stgraber@stgraber.org Signed-off-by: Alexander Mikhalitsyn aleksandr.mikhalitsyn@canonical.com --- tools/testing/selftests/seccomp/seccomp_bpf.c | 303 ++++++++++++++++++ 1 file changed, 303 insertions(+)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 874f17763536..bbf3ef58ad07 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -301,6 +301,10 @@ struct seccomp_notif_addfd_big { #define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5) #endif
+#ifndef SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS +#define SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS (1UL << 6) +#endif + #ifndef seccomp int seccomp(unsigned int op, unsigned int flags, void *args) { @@ -4416,6 +4420,305 @@ TEST(user_notification_sync) ASSERT_EQ(status, 0); }
+/* + * Ensure that the seccomp() behavior from before nested listeners were + * introduced is preserved when nesting is not enabled. + */ +TEST(user_notification_many_ret_notif_old_behavior) +{ + pid_t pid, ppid; + long ret; + int status, listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + /* Add some no-op filters for grins. */ + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + + /* Install a filter that returns SECCOMP_RET_USER_NOTIF, but has no listener. */ + ASSERT_GE(user_notif_syscall(__NR_getppid, 0), 0); + + /* Install a filter that returns SECCOMP_RET_USER_NOTIF, and then close its listener. */ + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + close(listener); + + /* + * Note that we can install another listener now (without nesting enabled!), + * because the notify fd of the previous filter has been closed. + */ + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + + /* Add some no-op filters for grins. 
*/ + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + + ppid = getpid(); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ret = syscall(__NR_getppid); + exit(ret != ppid); + } + + memset(&req, 0, sizeof(req)); + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + EXPECT_EQ(req.pid, pid); + EXPECT_EQ(req.data.nr, __NR_getppid); + + memset(&resp, 0, sizeof(resp)); + resp.id = req.id; + + /* Tell the kernel to continue the syscall and expect that upper-level filters are ignored. */ + resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE; + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + close(listener); +} + +TEST(user_notification_many_ret_notif_closed_listener_nested) +{ + pid_t pid; + long ret; + int status, listener, closed_listener; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + /* Add some no-op filters for grins. */ + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + + closed_listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS); + ASSERT_GE(closed_listener, 0); + + /* + * Note that we can install another listener now, while the previous notify fd + * is still open, because the previous filter allows nested listeners. + */ + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + + /* Now that a nested listener is installed, close the previous one. */ + close(closed_listener); + + /* Add some no-op filters for grins. 
*/ + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ret = syscall(__NR_getppid); + exit(ret >= 0 || errno != ENOSYS); + } + + memset(&req, 0, sizeof(req)); + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + EXPECT_EQ(req.pid, pid); + EXPECT_EQ(req.data.nr, __NR_getppid); + + memset(&resp, 0, sizeof(resp)); + resp.id = req.id; + + /* + * Tell the kernel to continue the syscall and expect ENOSYS in the child, + * because the upper filter's notify fd has been closed. + */ + resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE; + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + close(listener); +} + +/* + * Ensure that EBUSY is returned on an attempt to + * install a nested listener when nesting is not allowed. + */ +TEST(user_notification_nested_limits) +{ + pid_t pid; + long ret; + int i, status, listeners[8]; + + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + /* Install 6 levels of listeners and allow nesting. */ + for (i = 0; i < 6; i++) { + listeners[i] = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS); + ASSERT_GE(listeners[i], 0); + + /* Add some no-op filters for grins. */ + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + } + + /* Check, in a child process, the behavior when nesting is not allowed. */ + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) { + /* Install the next listener in the chain without allowing nesting. 
*/ + listeners[6] = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + if (listeners[6] < 0) + exit(1); + + /* Add some no-op filters for grins. */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog); + if (ret != 0) + exit(2); + + ret = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + /* Installing one more listener on top of it should result in EBUSY. */ + exit((ret >= 0 || errno != EBUSY) ? 3 : 0); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(user_notification_nested) +{ + pid_t pid; + long ret; + int i, status, listeners[6]; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + /* Install 6 levels of listeners and allow nesting. */ + for (i = 0; i < 6; i++) { + /* + * Install a filter that returns SECCOMP_RET_USER_NOTIF, but has no listener. + * We expect that these filters do not affect the end result. + */ + ASSERT_GE(user_notif_syscall(__NR_getppid, 0), 0); + + listeners[i] = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS); + ASSERT_GE(listeners[i], 0); + + /* Add some no-op filters for grins. 
*/ + EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); + } + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + ret = syscall(__NR_getppid); + exit(ret != (USER_NOTIF_MAGIC-3)); + } + + /* + * We want to have the following picture: + * + * | Listener level (i) | Listener decision | + * |--------------------|-------------------| + * | 0 | WHATEVER | + * | 1 | WHATEVER | + * | 2 | WHATEVER | + * | 3 | RETURN | <-- stop here + * | 4 | CONTINUE SYSCALL | + * | 5 | CONTINUE SYSCALL | <- start here (current->seccomp.filter) + * + * The first listener to receive a notification is level 5, then 4; + * then we expect to stop at level 3 and return from the syscall with + * the (USER_NOTIF_MAGIC - 3) return value. + */ + for (i = 6 - 1; i >= 3; i--) { + memset(&req, 0, sizeof(req)); + EXPECT_EQ(ioctl(listeners[i], SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + EXPECT_EQ(req.pid, pid); + EXPECT_EQ(req.data.nr, __NR_getppid); + + memset(&resp, 0, sizeof(resp)); + resp.id = req.id; + + if (i == 5 || i == 4) { + resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE; + } else { + resp.error = 0; + resp.val = USER_NOTIF_MAGIC - i; + } + + EXPECT_EQ(ioctl(listeners[i], SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + for (i = 0; i < 6; i++) + close(listeners[i]); +}
/* Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN. */ FIXTURE(O_SUSPEND_SECCOMP) {
linux-kselftest-mirror@lists.linaro.org