pidfd are /proc/pid directory file descriptors referring to a task group leader. Android low memory killer (LMK) needs pidfd polling support to replace code that currently checks for existence of /proc/pid for knowing a process that is signalled to be killed has died, which is both racy and slow. The pidfd poll approach is race-free, and also allows the LMK to do other things (such as by polling on other fds) while awaiting the process being killed to die.
It prevents a situation where a PID is reused between when LMK sends a kill signal and checks for existence of the PID, since the wrong PID is now possibly checked for existence.
In this patch, we follow the same mechanism used uhen the parent of the task group is to be notified, that is when the tasks waiting on a poll of pidfd are also awakened.
We have decided to include the waitqueue in struct pid for the following reasons: 1. The wait queue has to survive for the lifetime of the poll. Including it in task_struct would not be option in this case because the task can be reaped and destroyed before the poll returns.
2. By including the struct pid for the waitqueue means that during de_exec, the thread doing de_thread() automatically gets the new waitqueue/pid even though its task_struct is different.
Appropriate test cases are added in the second patch to provide coverage of all the cases the patch is handling.
Andy had a similar patch [1] in the past which was a good reference however this patch tries to handle different situations properly related to thread group existence, and how/where it notifies. And also solves other bugs (existence of taks_struct). Daniel had a similar patch [2] recently which this patch supercedes.
[1] https://lore.kernel.org/patchwork/patch/345098/ [2] https://lore.kernel.org/lkml/20181029175322.189042-1-dancol@google.com/
Cc: luto@amacapital.net Cc: rostedt@goodmis.org Cc: dancol@google.com Cc: christian@brauner.io Cc: jannh@google.com Cc: surenb@google.com Cc: torvalds@linux-foundation.org Co-developed-by: Daniel Colascione dancol@google.com Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org
--- fs/proc/base.c | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/pid.h | 3 +++ kernel/exit.c | 1 - kernel/pid.c | 2 ++ kernel/signal.c | 14 ++++++++++++++ 5 files changed, 58 insertions(+), 1 deletion(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 6a803a0b75df..879900082647 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3069,8 +3069,47 @@ static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); }
+static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts) +{ + int poll_flags = 0; + struct task_struct *task; + struct pid *pid; + + task = get_proc_task(file->f_path.dentry->d_inode); + + WARN_ON_ONCE(task && !thread_group_leader(task)); + + /* + * tasklist_lock must be held because to avoid racing with + * changes in exit_state and wake up. Basically to avoid: + * + * P0: read exit_state = 0 + * P1: write exit_state = EXIT_DEAD + * P1: Do a wake up - wq is empty, so do nothing + * P0: Queue for polling - wait forever. + */ + read_lock(&tasklist_lock); + if (!task) + poll_flags = POLLIN | POLLRDNORM | POLLERR; + else if (task->exit_state == EXIT_DEAD) + poll_flags = POLLIN | POLLRDNORM; + else if (task->exit_state == EXIT_ZOMBIE && thread_group_empty(task)) + poll_flags = POLLIN | POLLRDNORM; + + if (!poll_flags) { + pid = proc_pid(file->f_path.dentry->d_inode); + poll_wait(file, &pid->wait_pidfd, pts); + } + read_unlock(&tasklist_lock); + + if (task) + put_task_struct(task); + return poll_flags; +} + static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, + .poll = proc_tgid_base_poll, .iterate_shared = proc_tgid_base_readdir, .llseek = generic_file_llseek, }; diff --git a/include/linux/pid.h b/include/linux/pid.h index b6f4ba16065a..2e0dcbc6d14e 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -3,6 +3,7 @@ #define _LINUX_PID_H
#include <linux/rculist.h> +#include <linux/wait.h>
enum pid_type { @@ -60,6 +61,8 @@ struct pid unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; + /* wait queue for pidfd pollers */ + wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[1]; }; diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..c386ec52687d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -181,7 +181,6 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(tsk); }
- void release_task(struct task_struct *p) { struct task_struct *leader; diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..5c90c239242f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -214,6 +214,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]);
+ init_waitqueue_head(&pid->wait_pidfd); + upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..e3781703ef7e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1800,6 +1800,17 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) return ret; }
+static void do_wakeup_pidfd_pollers(struct task_struct *task) +{ + struct pid *pid; + + lockdep_assert_held(&tasklist_lock); + + pid = get_task_pid(task, PIDTYPE_PID); + wake_up_all(&pid->wait_pidfd); + put_pid(pid); +} + /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. @@ -1823,6 +1834,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+ /* Wake up all pidfd waiters */ + do_wakeup_pidfd_pollers(tsk); + if (sig != SIGCHLD) { /* * This is only possible if parent == real_parent.
Other than verifying pidfd based polling, the tests make sure that wait semantics are preserved with the pidfd poll. Notably the 2 cases: 1. If a thread group leader exits while threads still there, then no pidfd poll notifcation should happen. 2. If a non-thread group leader does an execve, then the thread group leader is signaled to exit and is replaced with the execing thread as the new leader, however the parent is not notified in this case.
Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org --- tools/testing/selftests/pidfd/Makefile | 2 +- tools/testing/selftests/pidfd/pidfd_test.c | 216 ++++++++++++++++++++- 2 files changed, 208 insertions(+), 10 deletions(-)
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index deaf8073bc06..4b31c14f273c 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,4 +1,4 @@ -CFLAGS += -g -I../../../../usr/include/ +CFLAGS += -g -I../../../../usr/include/ -lpthread
TEST_GEN_PROGS := pidfd_test
diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index d59378a93782..4d5206280091 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -4,18 +4,26 @@ #include <errno.h> #include <fcntl.h> #include <linux/types.h> +#include <pthread.h> #include <sched.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <syscall.h> +#include <sys/epoll.h> +#include <sys/mman.h> #include <sys/mount.h> #include <sys/wait.h> +#include <time.h> #include <unistd.h>
#include "../kselftest.h"
+#define CHILD_THREAD_MIN_WAIT 3 /* seconds */ +#define MAX_EVENTS 5 +#define __NR_pidfd_send_signal 424 + static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) { @@ -30,6 +38,22 @@ static void set_signal_received_on_sigusr1(int sig) signal_received = 1; }
+static int open_pidfd(const char *test_name, pid_t pid) +{ + char buf[256]; + int pidfd; + + snprintf(buf, sizeof(buf), "/proc/%d", pid); + pidfd = open(buf, O_DIRECTORY | O_CLOEXEC); + + if (pidfd < 0) + ksft_exit_fail_msg( + "%s test: Failed to open process file descriptor\n", + test_name); + + return pidfd; +} + /* * Straightforward test to see whether pidfd_send_signal() works is to send * a signal to ourself. @@ -87,7 +111,6 @@ static int wait_for_pid(pid_t pid) static int test_pidfd_send_signal_exited_fail(void) { int pidfd, ret, saved_errno; - char buf[256]; pid_t pid; const char *test_name = "pidfd_send_signal signal exited process";
@@ -99,17 +122,10 @@ static int test_pidfd_send_signal_exited_fail(void) if (pid == 0) _exit(EXIT_SUCCESS);
- snprintf(buf, sizeof(buf), "/proc/%d", pid); - - pidfd = open(buf, O_DIRECTORY | O_CLOEXEC); + pidfd = open_pidfd(test_name, pid);
(void)wait_for_pid(pid);
- if (pidfd < 0) - ksft_exit_fail_msg( - "%s test: Failed to open process file descriptor\n", - test_name); - ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); saved_errno = errno; close(pidfd); @@ -368,10 +384,192 @@ static int test_pidfd_send_signal_syscall_support(void) return 0; }
+void *test_pidfd_poll_exec_thread(void *priv) +{ + char waittime[256]; + + ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n", + getpid(), syscall(SYS_gettid)); + ksft_print_msg("Child Thread: doing exec of sleep\n"); + + sprintf(waittime, "%d", CHILD_THREAD_MIN_WAIT); + execl("/bin/sleep", "sleep", waittime, (char *)NULL); + + ksft_print_msg("Child Thread: DONE. pid %d tid %d\n", + getpid(), syscall(SYS_gettid)); + return NULL; +} + +static int poll_pidfd(const char *test_name, int pidfd) +{ + int c; + int epoll_fd = epoll_create1(0); + struct epoll_event event, events[MAX_EVENTS]; + + if (epoll_fd == -1) + ksft_exit_fail_msg("%s test: Failed to create epoll file descriptor\n", + test_name); + + event.events = EPOLLIN; + event.data.fd = pidfd; + + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pidfd, &event)) { + ksft_print_msg("%s test: Failed to add epoll file descriptor: Skipping\n", + test_name); + _exit(PIDFD_SKIP); + } + + c = epoll_wait(epoll_fd, events, MAX_EVENTS, 5000); + if (c != 1 || !(events[0].events & EPOLLIN)) + ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x)\n", + test_name, c, events[0].events); + + close(epoll_fd); + return events[0].events; + +} + +int test_pidfd_poll_exec(int use_waitpid) +{ + int pid, pidfd; + int status, ret; + pthread_t t1; + time_t prog_start = time(NULL); + const char *test_name = "pidfd_poll check for premature notification on child thread exec"; + + ksft_print_msg("Parent: pid: %d\n", getpid()); + pid = fork(); + if (pid == 0) { + ksft_print_msg("Child: starting. pid %d tid %d\n", getpid(), + syscall(SYS_gettid)); + pthread_create(&t1, NULL, test_pidfd_poll_exec_thread, NULL); + /* + * Exec in the non-leader thread will destroy the leader immediately. + * If the wait in the parent returns too soon, the test fails. + */ + while (1) + ; + } + + ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid); + + if (use_waitpid) { + ret = waitpid(pid, &status, 0); + if (ret == -1) + ksft_print_msg("Parent: error\n"); + + if (ret == pid) + ksft_print_msg("Parent: Child process waited for.\n"); + } else { + pidfd = open_pidfd(test_name, pid); + if (poll_pidfd(test_name, pidfd) & EPOLLERR) + ksft_exit_fail_msg("%s test: Unexpected epoll error\n", test_name); + } + + time_t prog_time = time(NULL) - prog_start; + + ksft_print_msg("Time waited for child: %lu\n", prog_time); + + /* Check to make sure poll_pidfd returns error after reaping */ + if (!use_waitpid && + (waitpid(pid, &status, 0) != pid || !(poll_pidfd(test_name, pidfd) & EPOLLERR))) { + ksft_exit_fail_msg("%s test: poll_pidfd EPOLLERR check failed\n", test_name); + } + close(pidfd); + + if (prog_time < CHILD_THREAD_MIN_WAIT || prog_time > CHILD_THREAD_MIN_WAIT + 2) + ksft_exit_fail_msg("%s test: Failed\n", test_name); + else + ksft_test_result_pass("%s test: Passed\n", test_name); +} + +void *test_pidfd_poll_leader_exit_thread(void *priv) +{ + char waittime[256]; + + ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n", + getpid(), syscall(SYS_gettid)); + sleep(CHILD_THREAD_MIN_WAIT); + ksft_print_msg("Child Thread: DONE. pid %d tid %d\n", getpid(), syscall(SYS_gettid)); + return NULL; +} + +static time_t *child_exit_secs; +int test_pidfd_poll_leader_exit(int use_waitpid) +{ + int pid, pidfd; + int status, ret; + pthread_t t1, t2; + time_t prog_start = time(NULL); + const char *test_name = "pidfd_poll check for premature notification on non-empty" + "group leader exit"; + + child_exit_secs = mmap(NULL, sizeof *child_exit_secs, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + + ksft_print_msg("Parent: pid: %d\n", getpid()); + pid = fork(); + if (pid == 0) { + ksft_print_msg("Child: starting. pid %d tid %d\n", getpid(), syscall(SYS_gettid)); + pthread_create(&t1, NULL, test_pidfd_poll_leader_exit_thread, NULL); + pthread_create(&t2, NULL, test_pidfd_poll_leader_exit_thread, NULL); + + /* + * glibc exit calls exit_group syscall, so explicity call exit only + * so that only the group leader exits, leaving the threads alone. + */ + *child_exit_secs = time(NULL); + syscall(SYS_exit, 0); + } + + ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid); + + if (use_waitpid) { + ret = waitpid(pid, &status, 0); + if (ret == -1) + ksft_print_msg("Parent: error\n"); + } else { + /* + * This sleep tests for the case where if the child exits, and is in + * EXIT_ZOMBIE, but the thread group leader is non-empty, then the poll + * doesn't prematurely return even though there are active threads + */ + sleep(1); + pidfd = open_pidfd(test_name, pid); + if (poll_pidfd(test_name, pidfd) & EPOLLERR) + ksft_exit_fail_msg("%s test: Unexpected epoll error\n", test_name); + } + + if (ret == pid) + ksft_print_msg("Parent: Child process waited for.\n"); + + time_t since_child_exit = time(NULL) - *child_exit_secs; + + ksft_print_msg("Time since child exit: %lu\n", since_child_exit); + + /* Check to make sure poll_pidfd returns error after reaping */ + if (!use_waitpid && + (waitpid(pid, &status, 0) != pid || + !(poll_pidfd(test_name, pidfd) & EPOLLERR))) { + ksft_exit_fail_msg("%s test: poll_pidfd EPOLLERR check failed\n", test_name); + } + close(pidfd); + + if (since_child_exit < CHILD_THREAD_MIN_WAIT || + since_child_exit > CHILD_THREAD_MIN_WAIT + 2) + ksft_exit_fail_msg("%s test: Failed\n", test_name); + else + ksft_test_result_pass("%s test: Passed\n", test_name); +} + int main(int argc, char **argv) { ksft_print_header();
+ test_pidfd_poll_exec(0); + test_pidfd_poll_exec(1); + test_pidfd_poll_leader_exit(0); + test_pidfd_poll_leader_exit(1); test_pidfd_send_signal_syscall_support(); test_pidfd_send_signal_simple_success(); test_pidfd_send_signal_exited_fail();
On Thu, Apr 11, 2019 at 01:50:43PM -0400, Joel Fernandes (Google) wrote:
Other than verifying pidfd based polling, the tests make sure that wait semantics are preserved with the pidfd poll. Notably the 2 cases:
- If a thread group leader exits while threads still there, then no pidfd poll notifcation should happen.
- If a non-thread group leader does an execve, then the thread group leader is signaled to exit and is replaced with the execing thread as the new leader, however the parent is not notified in this case.
Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org
tools/testing/selftests/pidfd/Makefile | 2 +- tools/testing/selftests/pidfd/pidfd_test.c | 216 ++++++++++++++++++++- 2 files changed, 208 insertions(+), 10 deletions(-)
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index deaf8073bc06..4b31c14f273c 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,4 +1,4 @@ -CFLAGS += -g -I../../../../usr/include/ +CFLAGS += -g -I../../../../usr/include/ -lpthread TEST_GEN_PROGS := pidfd_test diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index d59378a93782..4d5206280091 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -4,18 +4,26 @@ #include <errno.h> #include <fcntl.h> #include <linux/types.h> +#include <pthread.h> #include <sched.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <syscall.h> +#include <sys/epoll.h> +#include <sys/mman.h> #include <sys/mount.h> #include <sys/wait.h> +#include <time.h> #include <unistd.h> #include "../kselftest.h" +#define CHILD_THREAD_MIN_WAIT 3 /* seconds */ +#define MAX_EVENTS 5 +#define __NR_pidfd_send_signal 424
static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) { @@ -30,6 +38,22 @@ static void set_signal_received_on_sigusr1(int sig) signal_received = 1; } +static int open_pidfd(const char *test_name, pid_t pid) +{
- char buf[256];
- int pidfd;
- snprintf(buf, sizeof(buf), "/proc/%d", pid);
- pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
- if (pidfd < 0)
ksft_exit_fail_msg(
"%s test: Failed to open process file descriptor\n",
test_name);
- return pidfd;
+}
/*
- Straightforward test to see whether pidfd_send_signal() works is to send
- a signal to ourself.
@@ -87,7 +111,6 @@ static int wait_for_pid(pid_t pid) static int test_pidfd_send_signal_exited_fail(void) { int pidfd, ret, saved_errno;
- char buf[256]; pid_t pid; const char *test_name = "pidfd_send_signal signal exited process";
@@ -99,17 +122,10 @@ static int test_pidfd_send_signal_exited_fail(void) if (pid == 0) _exit(EXIT_SUCCESS);
- snprintf(buf, sizeof(buf), "/proc/%d", pid);
- pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
- pidfd = open_pidfd(test_name, pid);
(void)wait_for_pid(pid);
- if (pidfd < 0)
ksft_exit_fail_msg(
"%s test: Failed to open process file descriptor\n",
test_name);
- ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); saved_errno = errno; close(pidfd);
@@ -368,10 +384,192 @@ static int test_pidfd_send_signal_syscall_support(void) return 0; } +void *test_pidfd_poll_exec_thread(void *priv)
I think you can do static here?
+{
- char waittime[256];
- ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n",
getpid(), syscall(SYS_gettid));
- ksft_print_msg("Child Thread: doing exec of sleep\n");
- sprintf(waittime, "%d", CHILD_THREAD_MIN_WAIT);
- execl("/bin/sleep", "sleep", waittime, (char *)NULL);
- ksft_print_msg("Child Thread: DONE. pid %d tid %d\n",
getpid(), syscall(SYS_gettid));
You execl(), but then print stuff after that? Might also be worth switching to execlp().
- return NULL;
+}
+static int poll_pidfd(const char *test_name, int pidfd) +{
- int c;
- int epoll_fd = epoll_create1(0);
A style point, but I find it's best not to do resource allocation in variable declarations like this. It breaks up the usual pattern of:
ret = -ENOMEM; resource = allocate(); if (allocation_failed(resource)) goto err;
...
out: free(resource); err: return ret;
You're not closing this fd on every path (they all exit [for now :D] so it's probably ok), but it might be nice to make this match a more regular pattern.
- struct epoll_event event, events[MAX_EVENTS];
- if (epoll_fd == -1)
ksft_exit_fail_msg("%s test: Failed to create epoll file descriptor\n",
test_name);
- event.events = EPOLLIN;
- event.data.fd = pidfd;
- if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pidfd, &event)) {
ksft_print_msg("%s test: Failed to add epoll file descriptor: Skipping\n",
test_name);
Might be worth checking errno == EPERM here too (which according to the man page is the error for "epoll not supported", which is weird :).
_exit(PIDFD_SKIP);
- }
- c = epoll_wait(epoll_fd, events, MAX_EVENTS, 5000);
- if (c != 1 || !(events[0].events & EPOLLIN))
ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x)\n",
test_name, c, events[0].events);
- close(epoll_fd);
- return events[0].events;
+}
+int test_pidfd_poll_exec(int use_waitpid)
I think this can be static too.
+{
- int pid, pidfd;
- int status, ret;
- pthread_t t1;
- time_t prog_start = time(NULL);
- const char *test_name = "pidfd_poll check for premature notification on child thread exec";
- ksft_print_msg("Parent: pid: %d\n", getpid());
- pid = fork();
- if (pid == 0) {
ksft_print_msg("Child: starting. pid %d tid %d\n", getpid(),
syscall(SYS_gettid));
pthread_create(&t1, NULL, test_pidfd_poll_exec_thread, NULL);
/*
* Exec in the non-leader thread will destroy the leader immediately.
* If the wait in the parent returns too soon, the test fails.
*/
while (1)
;
- }
- ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);
- if (use_waitpid) {
ret = waitpid(pid, &status, 0);
if (ret == -1)
ksft_print_msg("Parent: error\n");
if (ret == pid)
ksft_print_msg("Parent: Child process waited for.\n");
- } else {
pidfd = open_pidfd(test_name, pid);
if (poll_pidfd(test_name, pidfd) & EPOLLERR)
ksft_exit_fail_msg("%s test: Unexpected epoll error\n", test_name);
- }
- time_t prog_time = time(NULL) - prog_start;
- ksft_print_msg("Time waited for child: %lu\n", prog_time);
- /* Check to make sure poll_pidfd returns error after reaping */
- if (!use_waitpid &&
(waitpid(pid, &status, 0) != pid || !(poll_pidfd(test_name, pidfd) & EPOLLERR))) {
ksft_exit_fail_msg("%s test: poll_pidfd EPOLLERR check failed\n", test_name);
- }
- close(pidfd);
- if (prog_time < CHILD_THREAD_MIN_WAIT || prog_time > CHILD_THREAD_MIN_WAIT + 2)
ksft_exit_fail_msg("%s test: Failed\n", test_name);
- else
ksft_test_result_pass("%s test: Passed\n", test_name);
+}
+void *test_pidfd_poll_leader_exit_thread(void *priv)
Another static I think?
Tycho
On Thu, Apr 11, 2019 at 01:50:42PM -0400, Joel Fernandes (Google) wrote:
pidfd are /proc/pid directory file descriptors referring to a task group leader. Android low memory killer (LMK) needs pidfd polling support to replace code that currently checks for existence of /proc/pid for knowing a process that is signalled to be killed has died, which is both racy and slow. The pidfd poll approach is race-free, and also allows the LMK to do other things (such as by polling on other fds) while awaiting the process being killed to die.
It appears to me that the "pidfd" now will be an anon inode fd, and not based on /proc/, based on discussions with Linus. So I'll rework the patches accordingly. However that is relatively independent of this patch so this version can also be reviewed before I send out the reworked version.
thanks,
- Joel
It prevents a situation where a PID is reused between when LMK sends a kill signal and checks for existence of the PID, since the wrong PID is now possibly checked for existence.
In this patch, we follow the same mechanism used uhen the parent of the task group is to be notified, that is when the tasks waiting on a poll of pidfd are also awakened.
We have decided to include the waitqueue in struct pid for the following reasons:
- The wait queue has to survive for the lifetime of the poll. Including
it in task_struct would not be option in this case because the task can be reaped and destroyed before the poll returns.
- By including the struct pid for the waitqueue means that during
de_exec, the thread doing de_thread() automatically gets the new waitqueue/pid even though its task_struct is different.
Appropriate test cases are added in the second patch to provide coverage of all the cases the patch is handling.
Andy had a similar patch [1] in the past which was a good reference however this patch tries to handle different situations properly related to thread group existence, and how/where it notifies. And also solves other bugs (existence of taks_struct). Daniel had a similar patch [2] recently which this patch supercedes.
[1] https://lore.kernel.org/patchwork/patch/345098/ [2] https://lore.kernel.org/lkml/20181029175322.189042-1-dancol@google.com/
Cc: luto@amacapital.net Cc: rostedt@goodmis.org Cc: dancol@google.com Cc: christian@brauner.io Cc: jannh@google.com Cc: surenb@google.com Cc: torvalds@linux-foundation.org Co-developed-by: Daniel Colascione dancol@google.com Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org
fs/proc/base.c | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/pid.h | 3 +++ kernel/exit.c | 1 - kernel/pid.c | 2 ++ kernel/signal.c | 14 ++++++++++++++ 5 files changed, 58 insertions(+), 1 deletion(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 6a803a0b75df..879900082647 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3069,8 +3069,47 @@ static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } +static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts) +{
- int poll_flags = 0;
- struct task_struct *task;
- struct pid *pid;
- task = get_proc_task(file->f_path.dentry->d_inode);
- WARN_ON_ONCE(task && !thread_group_leader(task));
- /*
* tasklist_lock must be held because to avoid racing with
* changes in exit_state and wake up. Basically to avoid:
*
* P0: read exit_state = 0
* P1: write exit_state = EXIT_DEAD
* P1: Do a wake up - wq is empty, so do nothing
* P0: Queue for polling - wait forever.
*/
- read_lock(&tasklist_lock);
- if (!task)
poll_flags = POLLIN | POLLRDNORM | POLLERR;
- else if (task->exit_state == EXIT_DEAD)
poll_flags = POLLIN | POLLRDNORM;
- else if (task->exit_state == EXIT_ZOMBIE && thread_group_empty(task))
poll_flags = POLLIN | POLLRDNORM;
- if (!poll_flags) {
pid = proc_pid(file->f_path.dentry->d_inode);
poll_wait(file, &pid->wait_pidfd, pts);
- }
- read_unlock(&tasklist_lock);
- if (task)
put_task_struct(task);
- return poll_flags;
+}
static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir,
- .poll = proc_tgid_base_poll, .iterate_shared = proc_tgid_base_readdir, .llseek = generic_file_llseek,
}; diff --git a/include/linux/pid.h b/include/linux/pid.h index b6f4ba16065a..2e0dcbc6d14e 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -3,6 +3,7 @@ #define _LINUX_PID_H #include <linux/rculist.h> +#include <linux/wait.h> enum pid_type { @@ -60,6 +61,8 @@ struct pid unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX];
- /* wait queue for pidfd pollers */
- wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[1];
}; diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..c386ec52687d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -181,7 +181,6 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(tsk); }
void release_task(struct task_struct *p) { struct task_struct *leader; diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..5c90c239242f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -214,6 +214,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]);
- init_waitqueue_head(&pid->wait_pidfd);
- upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING))
diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..e3781703ef7e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1800,6 +1800,17 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) return ret; } +static void do_wakeup_pidfd_pollers(struct task_struct *task) +{
- struct pid *pid;
- lockdep_assert_held(&tasklist_lock);
- pid = get_task_pid(task, PIDTYPE_PID);
- wake_up_all(&pid->wait_pidfd);
- put_pid(pid);
+}
/*
- Let a parent know about the death of a child.
- For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1823,6 +1834,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk)));
- /* Wake up all pidfd waiters */
- do_wakeup_pidfd_pollers(tsk);
- if (sig != SIGCHLD) { /*
- This is only possible if parent == real_parent.
-- 2.21.0.392.gf8f6787159e-goog
On April 11, 2019 10:00:59 PM GMT+02:00, Joel Fernandes joel@joelfernandes.org wrote:
On Thu, Apr 11, 2019 at 01:50:42PM -0400, Joel Fernandes (Google) wrote:
pidfd are /proc/pid directory file descriptors referring to a task
group
leader. Android low memory killer (LMK) needs pidfd polling support
to
replace code that currently checks for existence of /proc/pid for knowing a process that is signalled to be killed has died, which is
both
racy and slow. The pidfd poll approach is race-free, and also allows
the
LMK to do other things (such as by polling on other fds) while
awaiting
the process being killed to die.
It appears to me that the "pidfd" now will be an anon inode fd, and not based on /proc/, based on discussions with Linus. So I'll rework the patches accordingly. However that is relatively independent of this patch so this version can also be reviewed before I send out the reworked version.
Thank you very much, Joel. I'm off this week and traveling but I'll try to give it a look asap.
Christian
thanks,
- Joel
It prevents a situation where a PID is reused between when LMK sends
a
kill signal and checks for existence of the PID, since the wrong PID
is
now possibly checked for existence.
In this patch, we follow the same mechanism used uhen the parent of
the
task group is to be notified, that is when the tasks waiting on a
poll
of pidfd are also awakened.
We have decided to include the waitqueue in struct pid for the
following
reasons:
- The wait queue has to survive for the lifetime of the poll.
Including
it in task_struct would not be option in this case because the task
can
be reaped and destroyed before the poll returns.
- By including the struct pid for the waitqueue means that during
de_exec, the thread doing de_thread() automatically gets the new waitqueue/pid even though its task_struct is different.
Appropriate test cases are added in the second patch to provide
coverage
of all the cases the patch is handling.
Andy had a similar patch [1] in the past which was a good reference however this patch tries to handle different situations properly
related
to thread group existence, and how/where it notifies. And also solves other bugs (existence of taks_struct). Daniel had a similar patch
[2]
recently which this patch supercedes.
https://lore.kernel.org/lkml/20181029175322.189042-1-dancol@google.com/
Cc: luto@amacapital.net Cc: rostedt@goodmis.org Cc: dancol@google.com Cc: christian@brauner.io Cc: jannh@google.com Cc: surenb@google.com Cc: torvalds@linux-foundation.org Co-developed-by: Daniel Colascione dancol@google.com Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org
fs/proc/base.c | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/pid.h | 3 +++ kernel/exit.c | 1 - kernel/pid.c | 2 ++ kernel/signal.c | 14 ++++++++++++++ 5 files changed, 58 insertions(+), 1 deletion(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 6a803a0b75df..879900082647 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3069,8 +3069,47 @@ static int proc_tgid_base_readdir(struct file
*file, struct dir_context *ctx)
tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
} +static unsigned int proc_tgid_base_poll(struct file *file, struct
poll_table_struct *pts)
+{
- int poll_flags = 0;
- struct task_struct *task;
- struct pid *pid;
- task = get_proc_task(file->f_path.dentry->d_inode);
- WARN_ON_ONCE(task && !thread_group_leader(task));
- /*
* tasklist_lock must be held because to avoid racing with
* changes in exit_state and wake up. Basically to avoid:
*
* P0: read exit_state = 0
* P1: write exit_state = EXIT_DEAD
* P1: Do a wake up - wq is empty, so do nothing
* P0: Queue for polling - wait forever.
*/
- read_lock(&tasklist_lock);
- if (!task)
poll_flags = POLLIN | POLLRDNORM | POLLERR;
- else if (task->exit_state == EXIT_DEAD)
poll_flags = POLLIN | POLLRDNORM;
- else if (task->exit_state == EXIT_ZOMBIE &&
thread_group_empty(task))
poll_flags = POLLIN | POLLRDNORM;
- if (!poll_flags) {
pid = proc_pid(file->f_path.dentry->d_inode);
poll_wait(file, &pid->wait_pidfd, pts);
- }
- read_unlock(&tasklist_lock);
- if (task)
put_task_struct(task);
- return poll_flags;
+}
static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir,
- .poll = proc_tgid_base_poll, .iterate_shared = proc_tgid_base_readdir, .llseek = generic_file_llseek,
}; diff --git a/include/linux/pid.h b/include/linux/pid.h index b6f4ba16065a..2e0dcbc6d14e 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -3,6 +3,7 @@ #define _LINUX_PID_H #include <linux/rculist.h> +#include <linux/wait.h> enum pid_type { @@ -60,6 +61,8 @@ struct pid unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX];
- /* wait queue for pidfd pollers */
- wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[1];
}; diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..c386ec52687d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -181,7 +181,6 @@ static void delayed_put_task_struct(struct
rcu_head *rhp)
put_task_struct(tsk); }
void release_task(struct task_struct *p) { struct task_struct *leader; diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..5c90c239242f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -214,6 +214,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]);
- init_waitqueue_head(&pid->wait_pidfd);
- upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING))
diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..e3781703ef7e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1800,6 +1800,17 @@ int send_sigqueue(struct sigqueue *q, struct
pid *pid, enum pid_type type)
return ret; } +static void do_wakeup_pidfd_pollers(struct task_struct *task) +{
- struct pid *pid;
- lockdep_assert_held(&tasklist_lock);
- pid = get_task_pid(task, PIDTYPE_PID);
- wake_up_all(&pid->wait_pidfd);
- put_pid(pid);
+}
/*
- Let a parent know about the death of a child.
- For a stopped/continued status change, use
do_notify_parent_cldstop instead.
@@ -1823,6 +1834,9 @@ bool do_notify_parent(struct task_struct *tsk,
int sig)
BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk)));
- /* Wake up all pidfd waiters */
- do_wakeup_pidfd_pollers(tsk);
- if (sig != SIGCHLD) { /*
- This is only possible if parent == real_parent.
-- 2.21.0.392.gf8f6787159e-goog
On Thu, Apr 11, 2019 at 10:02:32PM +0200, Christian Brauner wrote:
On April 11, 2019 10:00:59 PM GMT+02:00, Joel Fernandes joel@joelfernandes.org wrote:
On Thu, Apr 11, 2019 at 01:50:42PM -0400, Joel Fernandes (Google) wrote:
pidfd are /proc/pid directory file descriptors referring to a task
group
leader. Android low memory killer (LMK) needs pidfd polling support
to
replace code that currently checks for existence of /proc/pid for knowing a process that is signalled to be killed has died, which is
both
racy and slow. The pidfd poll approach is race-free, and also allows
the
LMK to do other things (such as by polling on other fds) while
awaiting
the process being killed to die.
It appears to me that the "pidfd" now will be an anon inode fd, and not based on /proc/, based on discussions with Linus. So I'll rework the patches accordingly. However that is relatively independent of this patch so this version can also be reviewed before I send out the reworked version.
Thank you very much, Joel. I'm off this week and traveling but I'll try to give it a look asap.
Christian
Sounds great to me, thanks a lot Christian.
- Joel
On Thu, Apr 11, 2019 at 10:51 AM Joel Fernandes (Google) joel@joelfernandes.org wrote:
pidfd are /proc/pid directory file descriptors referring to a task group leader. Android low memory killer (LMK) needs pidfd polling support to replace code that currently checks for existence of /proc/pid for knowing a process that is signalled to be killed has died, which is both racy and slow. The pidfd poll approach is race-free, and also allows the LMK to do other things (such as by polling on other fds) while awaiting the process being killed to die.
It prevents a situation where a PID is reused between when LMK sends a kill signal and checks for existence of the PID, since the wrong PID is now possibly checked for existence.
In this patch, we follow the same mechanism used uhen the parent of the task group is to be notified, that is when the tasks waiting on a poll of pidfd are also awakened.
We have decided to include the waitqueue in struct pid for the following reasons:
- The wait queue has to survive for the lifetime of the poll. Including
it in task_struct would not be option in this case because the task can be reaped and destroyed before the poll returns.
Are you sure? I admit I'm not all that familiar with the innards of poll() on Linux, but I thought that the waitqueue only had to survive long enough to kick the polling thread and did *not* have to survive until poll() actually returned.
- By including the struct pid for the waitqueue means that during
de_exec, the thread doing de_thread() automatically gets the new waitqueue/pid even though its task_struct is different.
I didn't follow this. Can you clarify?
Also, please don't call your new helper wake_up_pidfd_pollers(). One of the goals of my patch was to make it generically possible for kernel code to wait for a task to exit. There are other cases besides pidfd for which this would be useful. Ahem, kthread. (The kthread implementation currently does some seriously awful things to detect when kthreads die.) Also, some hypothetical future vastly improved debugging API (to supercede ptrace for new applications) might want this.
--Andy
Hi Andy!
On Fri, Apr 12, 2019 at 02:32:53PM -0700, Andy Lutomirski wrote:
On Thu, Apr 11, 2019 at 10:51 AM Joel Fernandes (Google) joel@joelfernandes.org wrote:
pidfd are /proc/pid directory file descriptors referring to a task group leader. Android low memory killer (LMK) needs pidfd polling support to replace code that currently checks for existence of /proc/pid for knowing a process that is signalled to be killed has died, which is both racy and slow. The pidfd poll approach is race-free, and also allows the LMK to do other things (such as by polling on other fds) while awaiting the process being killed to die.
It prevents a situation where a PID is reused between when LMK sends a kill signal and checks for existence of the PID, since the wrong PID is now possibly checked for existence.
In this patch, we follow the same mechanism used uhen the parent of the task group is to be notified, that is when the tasks waiting on a poll of pidfd are also awakened.
We have decided to include the waitqueue in struct pid for the following reasons:
- The wait queue has to survive for the lifetime of the poll. Including
it in task_struct would not be option in this case because the task can be reaped and destroyed before the poll returns.
Are you sure? I admit I'm not all that familiar with the innards of poll() on Linux, but I thought that the waitqueue only had to survive long enough to kick the polling thread and did *not* have to survive until poll() actually returned.
I am not sure now. I thought epoll(2) was based on the wait_event APIs, however more closely looking at the eventpoll code, it looks like there are 2 waitqueues involved, one that we pass and the other that is a part of the eventpoll session itself, so you could be right about that. Daniel Colascione may have some more thoughts about it since he brought up the possiblity of a wq life-time issue. Daniel? We were just playing it safe.
Either way the waitqueue in struct pid has the advantage mentioned below:
- By including the struct pid for the waitqueue means that during
de_exec, the thread doing de_thread() automatically gets the new waitqueue/pid even though its task_struct is different.
I didn't follow this. Can you clarify?
Sure. de_thread() can called when all threads of a thread group need to die when any thread in the group does an execve. The thread doing the execve will become the new thread leader. In this case, the thread that did the exec gets the pid of the new leader. The semantics of wait(2) are such that the wait should not return (unblock) in the above scenario because the group is non-empty even though the task_struct of the group leader died. IOW, we should not wake up any pidfd pollers in this cases.
So basically what I was trying to say in point 2 above is that because of putting the waitqueue in struct pid, the change_pid() in de_thread() automatically carries the waiting tasks to the new task_struct leader, because the pid gets transferred to the new leader. If we put it in task_struct, then that wouldn't work since the leader's task_struct would get destroyed and we would have to handle the case in some other way. At least that is the theory. Anyway we specifically test for this case in patch 2/2 and also tested that not handling this case fails the test.
Also, please don't call your new helper wake_up_pidfd_pollers(). One
I will call it wake_up_pollers() then, if that's Ok.
of the goals of my patch was to make it generically possible for kernel code to wait for a task to exit. There are other cases besides pidfd for which this would be useful. Ahem, kthread. (The kthread implementation currently does some seriously awful things to detect when kthreads die.) Also, some hypothetical future vastly improved debugging API (to supercede ptrace for new applications) might want this.
Ah I see :-) Nice to know we can use this to improve the kthread code.
thanks,
- Joel
On Fri, Apr 12, 2019 at 2:33 PM Andy Lutomirski luto@kernel.org wrote:
Are you sure? I admit I'm not all that familiar with the innards of poll() on Linux, but I thought that the waitqueue only had to survive long enough to kick the polling thread and did *not* have to survive until poll() actually returned.
That is *not* true by default.
You can do that, but you need to make sure that your wakeup function is one that removed itself from the wait queues. You can do that with DEFINE_WAIT(name), which uses autoremove_wake_function(), or by using your own auto-removing wakeup function together with DEFINE_WAIT_FUNC() or init_waitqueue_func_entry().
But the default wake function does not remove on wakeup, and you'll have to be around until poll() itself tears down all the tables.
In particular, the normal "poll_wait()" will use __pollwait, which does:
init_waitqueue_func_entry(&entry->wait, pollwake);
and pollwake() (which is thus what gets called at wake time) will not remove anything from the wait queue.
So no, by default your wait queue has to stay around for the duration of poll() (ie the duration of the file descriptor, since poll() gets a reference to it).
You *can* play games with pollwait functions (and with wait functions in general), but by default you should consider the wait function to stay around.
Linus
On 04/11, Joel Fernandes (Google) wrote:
+static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts) +{
- int poll_flags = 0;
- struct task_struct *task;
- struct pid *pid;
- task = get_proc_task(file->f_path.dentry->d_inode);
- WARN_ON_ONCE(task && !thread_group_leader(task));
- /*
* tasklist_lock must be held because to avoid racing with
* changes in exit_state and wake up. Basically to avoid:
*
* P0: read exit_state = 0
* P1: write exit_state = EXIT_DEAD
* P1: Do a wake up - wq is empty, so do nothing
* P0: Queue for polling - wait forever.
*/
- read_lock(&tasklist_lock);
- if (!task)
poll_flags = POLLIN | POLLRDNORM | POLLERR;
- else if (task->exit_state == EXIT_DEAD)
poll_flags = POLLIN | POLLRDNORM;
- else if (task->exit_state == EXIT_ZOMBIE && thread_group_empty(task))
poll_flags = POLLIN | POLLRDNORM;
- if (!poll_flags) {
pid = proc_pid(file->f_path.dentry->d_inode);
poll_wait(file, &pid->wait_pidfd, pts);
- }
can't understand...
Could you explain when it should return POLLIN? When the whole process exits?
Then all you need is
!task || task->exit_state && thread_group_empty(task)
Please do not use EXIT_DEAD/EXIT_ZOMBIE. And ->wait_pidfd should probably live in task->signal_struct.
Oleg.
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
On 04/11, Joel Fernandes (Google) wrote:
+static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts) +{
- int poll_flags = 0;
- struct task_struct *task;
- struct pid *pid;
- task = get_proc_task(file->f_path.dentry->d_inode);
- WARN_ON_ONCE(task && !thread_group_leader(task));
- /*
* tasklist_lock must be held because to avoid racing with
* changes in exit_state and wake up. Basically to avoid:
*
* P0: read exit_state = 0
* P1: write exit_state = EXIT_DEAD
* P1: Do a wake up - wq is empty, so do nothing
* P0: Queue for polling - wait forever.
*/
- read_lock(&tasklist_lock);
- if (!task)
poll_flags = POLLIN | POLLRDNORM | POLLERR;
- else if (task->exit_state == EXIT_DEAD)
poll_flags = POLLIN | POLLRDNORM;
- else if (task->exit_state == EXIT_ZOMBIE && thread_group_empty(task))
poll_flags = POLLIN | POLLRDNORM;
- if (!poll_flags) {
pid = proc_pid(file->f_path.dentry->d_inode);
poll_wait(file, &pid->wait_pidfd, pts);
- }
can't understand...
Could you explain when it should return POLLIN? When the whole process exits?
It returns POLLIN when the task is dead or doesn't exist anymore, or when it is in a zombie state and there's no other thread in the thread group.
Then all you need is
!task || task->exit_state && thread_group_empty(task)
Yes this works as well, all the tests pass with your suggestion so I'll change it to that. Although I will the be giving up returing EPOLLERR if the task_struct doesn't exit. We don't need that, but I thought it was cool to return it anyway.
Please do not use EXIT_DEAD/EXIT_ZOMBIE. And ->wait_pidfd should probably live in task->signal_struct.
About wait_pidfd living in signal_struct, that wont work since the waitqueue has to survive for the duration of the poll system call. Linus also confirmed this: https://lore.kernel.org/patchwork/patch/1060650/#1257371
Also the waitqueue living in struct pid solves the de_thread() issue I mentioned later in the following thread and in the commit message: https://lore.kernel.org/patchwork/comment/1257175/
thanks,
- Joel
On Tue, Apr 16, 2019 at 03:20:51PM -0400, Joel Fernandes wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
On 04/11, Joel Fernandes (Google) wrote:
+static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts) +{
- int poll_flags = 0;
- struct task_struct *task;
- struct pid *pid;
- task = get_proc_task(file->f_path.dentry->d_inode);
- WARN_ON_ONCE(task && !thread_group_leader(task));
- /*
* tasklist_lock must be held because to avoid racing with
* changes in exit_state and wake up. Basically to avoid:
*
* P0: read exit_state = 0
* P1: write exit_state = EXIT_DEAD
* P1: Do a wake up - wq is empty, so do nothing
* P0: Queue for polling - wait forever.
*/
- read_lock(&tasklist_lock);
- if (!task)
poll_flags = POLLIN | POLLRDNORM | POLLERR;
- else if (task->exit_state == EXIT_DEAD)
poll_flags = POLLIN | POLLRDNORM;
- else if (task->exit_state == EXIT_ZOMBIE && thread_group_empty(task))
poll_flags = POLLIN | POLLRDNORM;
- if (!poll_flags) {
pid = proc_pid(file->f_path.dentry->d_inode);
poll_wait(file, &pid->wait_pidfd, pts);
- }
can't understand...
Could you explain when it should return POLLIN? When the whole process exits?
It returns POLLIN when the task is dead or doesn't exist anymore, or when it is in a zombie state and there's no other thread in the thread group.
Then all you need is
!task || task->exit_state && thread_group_empty(task)
Yes this works as well, all the tests pass with your suggestion so I'll change it to that. Although I will the be giving up returing EPOLLERR if the task_struct doesn't exit. We don't need that, but I thought it was cool to return it anyway.
Here I actually meant "task_struct doesn't exist" , sorry.
thanks,
- Joel
On 04/16, Joel Fernandes wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
Could you explain when it should return POLLIN? When the whole process exits?
It returns POLLIN when the task is dead or doesn't exist anymore, or when it is in a zombie state and there's no other thread in the thread group.
IOW, when the whole thread group exits, so it can't be used to monitor sub-threads.
just in case... speaking of this patch it doesn't modify proc_tid_base_operations, so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are going to use the anonymous file returned by CLONE_PIDFD ?
Then all you need is
!task || task->exit_state && thread_group_empty(task)
Yes this works as well, all the tests pass with your suggestion so I'll change it to that. Although I will the be giving up returing EPOLLERR if the task_struct doesn't exit. We don't need that, but I thought it was cool to return it anyway.
OK, task == NULL means that it was already reaped by parent, pid_nr is free, probably useful....
Please do not use EXIT_DEAD/EXIT_ZOMBIE. And ->wait_pidfd should probably live in task->signal_struct.
About wait_pidfd living in signal_struct, that wont work since the waitqueue has to survive for the duration of the poll system call.
That is why I said this will need the additional cleanup in free_signal_struct(). But I was wrong, somehow I forgot that free_poll_entry() needs wq_head->lock ;) so this will need much more complications, lets forget it...
Also the waitqueue living in struct pid solves the de_thread() issue I mentioned later in the following thread and in the commit message: https://lore.kernel.org/patchwork/comment/1257175/
Hmm...
2. By including the struct pid for the waitqueue means that during de_exec, the thread doing de_thread() automatically gets the new waitqueue/pid even though its task_struct is different.
this one?
this is not true, or I do not understand...
it gets the _same_ (old, not new) PIDTYPE_TGID pid even if it changes task_struct. But probably this is what you actually meant, because this is what your patch wants or I am totally confused.
And note that exec/de_thread doesn't change ->signal_struct, so I do not understand you anyway. Nevermind.
Oleg.
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote:
On 04/16, Joel Fernandes wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
Could you explain when it should return POLLIN? When the whole process exits?
It returns POLLIN when the task is dead or doesn't exist anymore, or when it is in a zombie state and there's no other thread in the thread group.
IOW, when the whole thread group exits, so it can't be used to monitor sub-threads.
just in case... speaking of this patch it doesn't modify proc_tid_base_operations, so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are going to use the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader. (Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote:
On 04/16, Joel Fernandes wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
Could you explain when it should return POLLIN? When the whole
process exits?
It returns POLLIN when the task is dead or doesn't exist anymore,
or when it
is in a zombie state and there's no other thread in the thread
group.
IOW, when the whole thread group exits, so it can't be used to
monitor sub-threads.
just in case... speaking of this patch it doesn't modify
proc_tid_base_operations,
so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are
going to use
the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader. (Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
On Thu, Apr 18, 2019 at 10:26 AM Christian Brauner christian@brauner.io wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote:
On 04/16, Joel Fernandes wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
Could you explain when it should return POLLIN? When the whole
process exits?
It returns POLLIN when the task is dead or doesn't exist anymore,
or when it
is in a zombie state and there's no other thread in the thread
group.
IOW, when the whole thread group exits, so it can't be used to
monitor sub-threads.
just in case... speaking of this patch it doesn't modify
proc_tid_base_operations,
so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are
going to use
the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader.
Huh. That seems very weird. Is that too late to change now? It feels like a bug.
(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader
If you're _not_ dealing with a leader, right?
, or make the code
work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
IMHO, focusing on the thread group case for now might be best. We can always support thread management in future work.
Besides: I'm not sure that we need kernel support for thread monitoring. Can't libc provide a pollable FD for a thread internally? libc can always run code just before thread exit, and it can wake a signalfd at that point. Directly terminating individual threads without going through userland is something that breaks the process anyway: it's legal and normal to SIGKILL a process a whole, but if an individual thread terminates without going through libc, the process is likely going to be fatally broken anyway. (What if it's holding the heap lock?)
I know that in some tools want to wait for termination of individual threads in an external monitored process, but couldn't these tools cooperate with libc to get these per-thread eventfds?
Is there a use case I'm missing?
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote:
On 04/16, Joel Fernandes wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
Could you explain when it should return POLLIN? When the whole
process exits?
It returns POLLIN when the task is dead or doesn't exist anymore,
or when it
is in a zombie state and there's no other thread in the thread
group.
IOW, when the whole thread group exits, so it can't be used to
monitor sub-threads.
just in case... speaking of this patch it doesn't modify
proc_tid_base_operations,
so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are
going to use
the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader. (Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Also we wanted to make the polling of pidfd quite close to the wait(2) family semantics which, as I understand correctly, is not something that works for sub-threads. In the future, we could bail in poll(2) or return an error, if clone(2) starts supporting thread pidfds, but at the moment I will like to keep the WARN_ON just in case. Please let me know if I missed something.
thanks!
- Joel
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote:
On 04/16, Joel Fernandes wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
Could you explain when it should return POLLIN? When the whole
process exits?
It returns POLLIN when the task is dead or doesn't exist anymore,
or when it
is in a zombie state and there's no other thread in the thread
group.
IOW, when the whole thread group exits, so it can't be used to
monitor sub-threads.
just in case... speaking of this patch it doesn't modify
proc_tid_base_operations,
so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are
going to use
the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader. (Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate: - POLLHUP -> process has exited - POLLIN -> information can be read
Christian
On Fri, Apr 19, 2019 at 09:18:58PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote:
On 04/16, Joel Fernandes wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > Could you explain when it should return POLLIN? When the whole
process exits?
It returns POLLIN when the task is dead or doesn't exist anymore,
or when it
is in a zombie state and there's no other thread in the thread
group.
IOW, when the whole thread group exits, so it can't be used to
monitor sub-threads.
just in case... speaking of this patch it doesn't modify
proc_tid_base_operations,
so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are
going to use
the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader. (Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
or POLLRDHUP. The check you'd usually perform would probably be if ((revents & (POLLIN | POLLPRI)) > 0) && ((revents & (POLLHUP | POLLRDHUP)) > 0) /* keep on trying to read */
I guess you have that set of flags already suggested in another mail?
Christian
- POLLIN -> information can be read
Christian
On Fri, Apr 19, 2019 at 09:22:33PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 09:18:58PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote:
On 04/16, Joel Fernandes wrote: > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > Could you explain when it should return POLLIN? When the whole
process exits?
> > It returns POLLIN when the task is dead or doesn't exist anymore,
or when it
> is in a zombie state and there's no other thread in the thread
group.
IOW, when the whole thread group exits, so it can't be used to
monitor sub-threads.
just in case... speaking of this patch it doesn't modify
proc_tid_base_operations,
so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are
going to use
the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader. (Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
or POLLRDHUP. The check you'd usually perform would probably be if ((revents & (POLLIN | POLLPRI)) > 0) && ((revents & (POLLHUP | POLLRDHUP)) > 0) /* keep on trying to read */
I guess you have that set of flags already suggested in another mail?
The code where this pattern is e.g. used is in drivers/tty/n_tty.c:
static __poll_t n_tty_poll(struct tty_struct *tty, struct file *file, poll_table *wait) { __poll_t mask = 0;
poll_wait(file, &tty->read_wait, wait); poll_wait(file, &tty->write_wait, wait); if (input_available_p(tty, 1)) mask |= EPOLLIN | EPOLLRDNORM; else { tty_buffer_flush_work(tty->port); if (input_available_p(tty, 1)) mask |= EPOLLIN | EPOLLRDNORM; } if (tty->packet && tty->link->ctrl_status) mask |= EPOLLPRI | EPOLLIN | EPOLLRDNORM; if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) mask |= EPOLLHUP; if (tty_hung_up_p(file)) mask |= EPOLLHUP; if (tty->ops->write && !tty_is_writelocked(tty) && tty_chars_in_buffer(tty) < WAKEUP_CHARS && tty_write_room(tty) > 0) mask |= EPOLLOUT | EPOLLWRNORM; return mask; }
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote:
On 04/16, Joel Fernandes wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > Could you explain when it should return POLLIN? When the whole
process exits?
It returns POLLIN when the task is dead or doesn't exist anymore,
or when it
is in a zombie state and there's no other thread in the thread
group.
IOW, when the whole thread group exits, so it can't be used to
monitor sub-threads.
just in case... speaking of this patch it doesn't modify
proc_tid_base_operations,
so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are
going to use
the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader. (Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
However, we also are returning EPOLLERR in previous patch if the task_struct has been reaped (task == NULL). I could change that to EPOLLHUP.
So the update patch looks like below. Thoughts?
---8<-----------------------
diff --git a/fs/proc/base.c b/fs/proc/base.c index 6a803a0b75df..eb279b5f4115 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3069,8 +3069,45 @@ static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); }
+static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts) +{ + int poll_flags = 0; + struct task_struct *task; + struct pid *pid; + + task = get_proc_task(file->f_path.dentry->d_inode); + + WARN_ON_ONCE(task && !thread_group_leader(task)); + + /* + * tasklist_lock must be held because to avoid racing with + * changes in exit_state and wake up. Basically to avoid: + * + * P0: read exit_state = 0 + * P1: write exit_state = EXIT_DEAD + * P1: Do a wake up - wq is empty, so do nothing + * P0: Queue for polling - wait forever. + */ + read_lock(&tasklist_lock); + if (!task) + poll_flags = POLLIN | POLLRDNORM | POLLHUP; + else if (task->exit_state && thread_group_empty(task)) + poll_flags = POLLIN | POLLRDNORM; + + if (!poll_flags) { + pid = proc_pid(file->f_path.dentry->d_inode); + poll_wait(file, &pid->wait_pidfd, pts); + } + read_unlock(&tasklist_lock); + + if (task) + put_task_struct(task); + return poll_flags; +} + static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, + .poll = proc_tgid_base_poll, .iterate_shared = proc_tgid_base_readdir, .llseek = generic_file_llseek, }; diff --git a/include/linux/pid.h b/include/linux/pid.h index b6f4ba16065a..2e0dcbc6d14e 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -3,6 +3,7 @@ #define _LINUX_PID_H
#include <linux/rculist.h> +#include <linux/wait.h>
enum pid_type { @@ -60,6 +61,8 @@ struct pid unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; + /* wait queue for pidfd pollers */ + wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[1]; }; diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..5c90c239242f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -214,6 +214,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]);
+ init_waitqueue_head(&pid->wait_pidfd); + upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..e3781703ef7e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1800,6 +1800,17 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) return ret; }
+static void do_wakeup_pidfd_pollers(struct task_struct *task) +{ + struct pid *pid; + + lockdep_assert_held(&tasklist_lock); + + pid = get_task_pid(task, PIDTYPE_PID); + wake_up_all(&pid->wait_pidfd); + put_pid(pid); +} + /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. @@ -1823,6 +1834,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+ /* Wake up all pidfd waiters */ + do_wakeup_pidfd_pollers(tsk); + if (sig != SIGCHLD) { /* * This is only possible if parent == real_parent. diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index deaf8073bc06..4b31c14f273c 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,4 +1,4 @@ -CFLAGS += -g -I../../../../usr/include/ +CFLAGS += -g -I../../../../usr/include/ -lpthread
TEST_GEN_PROGS := pidfd_test
diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index d59378a93782..57ae217339e9 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -4,18 +4,26 @@ #include <errno.h> #include <fcntl.h> #include <linux/types.h> +#include <pthread.h> #include <sched.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <syscall.h> +#include <sys/epoll.h> +#include <sys/mman.h> #include <sys/mount.h> #include <sys/wait.h> +#include <time.h> #include <unistd.h>
#include "../kselftest.h"
+#define CHILD_THREAD_MIN_WAIT 3 /* seconds */ +#define MAX_EVENTS 5 +#define __NR_pidfd_send_signal 424 + static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) { @@ -30,6 +38,22 @@ static void set_signal_received_on_sigusr1(int sig) signal_received = 1; }
+static int open_pidfd(const char *test_name, pid_t pid) +{ + char buf[256]; + int pidfd; + + snprintf(buf, sizeof(buf), "/proc/%d", pid); + pidfd = open(buf, O_DIRECTORY | O_CLOEXEC); + + if (pidfd < 0) + ksft_exit_fail_msg( + "%s test: Failed to open process file descriptor\n", + test_name); + + return pidfd; +} + /* * Straightforward test to see whether pidfd_send_signal() works is to send * a signal to ourself. @@ -87,7 +111,6 @@ static int wait_for_pid(pid_t pid) static int test_pidfd_send_signal_exited_fail(void) { int pidfd, ret, saved_errno; - char buf[256]; pid_t pid; const char *test_name = "pidfd_send_signal signal exited process";
@@ -99,17 +122,10 @@ static int test_pidfd_send_signal_exited_fail(void) if (pid == 0) _exit(EXIT_SUCCESS);
- snprintf(buf, sizeof(buf), "/proc/%d", pid); - - pidfd = open(buf, O_DIRECTORY | O_CLOEXEC); + pidfd = open_pidfd(test_name, pid);
(void)wait_for_pid(pid);
- if (pidfd < 0) - ksft_exit_fail_msg( - "%s test: Failed to open process file descriptor\n", - test_name); - ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); saved_errno = errno; close(pidfd); @@ -368,10 +384,179 @@ static int test_pidfd_send_signal_syscall_support(void) return 0; }
+void *test_pidfd_poll_exec_thread(void *priv) +{ + char waittime[256]; + + ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n", + getpid(), syscall(SYS_gettid)); + ksft_print_msg("Child Thread: doing exec of sleep\n"); + + sprintf(waittime, "%d", CHILD_THREAD_MIN_WAIT); + execl("/bin/sleep", "sleep", waittime, (char *)NULL); + + ksft_print_msg("Child Thread: DONE. pid %d tid %d\n", + getpid(), syscall(SYS_gettid)); + return NULL; +} + +static int poll_pidfd(const char *test_name, int pidfd) +{ + int c; + int epoll_fd = epoll_create1(0); + struct epoll_event event, events[MAX_EVENTS]; + + if (epoll_fd == -1) + ksft_exit_fail_msg("%s test: Failed to create epoll file descriptor\n", + test_name); + + event.events = EPOLLIN; + event.data.fd = pidfd; + + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pidfd, &event)) { + ksft_print_msg("%s test: Failed to add epoll file descriptor: Skipping\n", + test_name); + _exit(PIDFD_SKIP); + } + + c = epoll_wait(epoll_fd, events, MAX_EVENTS, 5000); + if (c != 1 || !(events[0].events & EPOLLIN)) + ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x)\n", + test_name, c, events[0].events); + + close(epoll_fd); + return events[0].events; + +} + +int test_pidfd_poll_exec(int use_waitpid) +{ + int pid, pidfd; + int status, ret; + pthread_t t1; + time_t prog_start = time(NULL); + const char *test_name = "pidfd_poll check for premature notification on child thread exec"; + + ksft_print_msg("Parent: pid: %d\n", getpid()); + pid = fork(); + if (pid == 0) { + ksft_print_msg("Child: starting. pid %d tid %d\n", getpid(), + syscall(SYS_gettid)); + pthread_create(&t1, NULL, test_pidfd_poll_exec_thread, NULL); + /* + * Exec in the non-leader thread will destroy the leader immediately. + * If the wait in the parent returns too soon, the test fails. + */ + while (1) + ; + } + + ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid); + + if (use_waitpid) { + ret = waitpid(pid, &status, 0); + if (ret == -1) + ksft_print_msg("Parent: error\n"); + + if (ret == pid) + ksft_print_msg("Parent: Child process waited for.\n"); + } else { + pidfd = open_pidfd(test_name, pid); + poll_pidfd(test_name, pidfd); + } + + time_t prog_time = time(NULL) - prog_start; + + ksft_print_msg("Time waited for child: %lu\n", prog_time); + + close(pidfd); + + if (prog_time < CHILD_THREAD_MIN_WAIT || prog_time > CHILD_THREAD_MIN_WAIT + 2) + ksft_exit_fail_msg("%s test: Failed\n", test_name); + else + ksft_test_result_pass("%s test: Passed\n", test_name); +} + +void *test_pidfd_poll_leader_exit_thread(void *priv) +{ + char waittime[256]; + + ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n", + getpid(), syscall(SYS_gettid)); + sleep(CHILD_THREAD_MIN_WAIT); + ksft_print_msg("Child Thread: DONE. pid %d tid %d\n", getpid(), syscall(SYS_gettid)); + return NULL; +} + +static time_t *child_exit_secs; +int test_pidfd_poll_leader_exit(int use_waitpid) +{ + int pid, pidfd; + int status, ret; + pthread_t t1, t2; + time_t prog_start = time(NULL); + const char *test_name = "pidfd_poll check for premature notification on non-empty" + "group leader exit"; + + child_exit_secs = mmap(NULL, sizeof *child_exit_secs, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + + ksft_print_msg("Parent: pid: %d\n", getpid()); + pid = fork(); + if (pid == 0) { + ksft_print_msg("Child: starting. pid %d tid %d\n", getpid(), syscall(SYS_gettid)); + pthread_create(&t1, NULL, test_pidfd_poll_leader_exit_thread, NULL); + pthread_create(&t2, NULL, test_pidfd_poll_leader_exit_thread, NULL); + + /* + * glibc exit calls exit_group syscall, so explicity call exit only + * so that only the group leader exits, leaving the threads alone. + */ + *child_exit_secs = time(NULL); + syscall(SYS_exit, 0); + } + + ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid); + + if (use_waitpid) { + ret = waitpid(pid, &status, 0); + if (ret == -1) + ksft_print_msg("Parent: error\n"); + } else { + /* + * This sleep tests for the case where if the child exits, and is in + * EXIT_ZOMBIE, but the thread group leader is non-empty, then the poll + * doesn't prematurely return even though there are active threads + */ + sleep(1); + pidfd = open_pidfd(test_name, pid); + poll_pidfd(test_name, pidfd); + } + + if (ret == pid) + ksft_print_msg("Parent: Child process waited for.\n"); + + time_t since_child_exit = time(NULL) - *child_exit_secs; + + ksft_print_msg("Time since child exit: %lu\n", since_child_exit); + + close(pidfd); + + if (since_child_exit < CHILD_THREAD_MIN_WAIT || + since_child_exit > CHILD_THREAD_MIN_WAIT + 2) + ksft_exit_fail_msg("%s test: Failed\n", test_name); + else + ksft_test_result_pass("%s test: Passed\n", test_name); +} + int main(int argc, char **argv) { ksft_print_header();
+ test_pidfd_poll_exec(0); + test_pidfd_poll_exec(1); + test_pidfd_poll_leader_exit(0); + test_pidfd_poll_leader_exit(1); test_pidfd_send_signal_syscall_support(); test_pidfd_send_signal_simple_success(); test_pidfd_send_signal_exited_fail();
On Fri, Apr 19, 2019 at 03:49:02PM -0400, Joel Fernandes wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote:
On 04/16, Joel Fernandes wrote: > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > Could you explain when it should return POLLIN? When the whole
process exits?
> > It returns POLLIN when the task is dead or doesn't exist anymore,
or when it
> is in a zombie state and there's no other thread in the thread
group.
IOW, when the whole thread group exits, so it can't be used to
monitor sub-threads.
just in case... speaking of this patch it doesn't modify
proc_tid_base_operations,
so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are
going to use
the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader. (Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking
So your idea is that you always get EPOLLIN when the process is alive, i.e. epoll_wait() immediately returns for a pidfd that referes to a live process if you specify EPOLLIN? E.g. if I specify EPOLLIN | EPOLLHUP then epoll_wait() would constantly return. I would then need to check for EPOLLHUP, see that it is not present and then go back into the epoll_wait() loop and play the same game again? What do you need this for? And if you have a valid reason to do this would it make sense to set POLLPRI if the actual exit status can be read? This way one could at least specify POLLPRI | POLLHUP without being constantly woken.
we always return EPOLLIN. If process has not exited, then it blocks.
However, we also are returning EPOLLERR in previous patch if the task_struct has been reaped (task == NULL). I could change that to EPOLLHUP.
That would be here, right?:
- if (!task)
poll_flags = POLLIN | POLLRDNORM | POLLHUP;
That sounds better to me that EPOLLERR.
So the update patch looks like below. Thoughts?
---8<-----------------------
diff --git a/fs/proc/base.c b/fs/proc/base.c index 6a803a0b75df..eb279b5f4115 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3069,8 +3069,45 @@ static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } +static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts) +{
- int poll_flags = 0;
- struct task_struct *task;
- struct pid *pid;
- task = get_proc_task(file->f_path.dentry->d_inode);
- WARN_ON_ONCE(task && !thread_group_leader(task));
- /*
* tasklist_lock must be held because to avoid racing with
* changes in exit_state and wake up. Basically to avoid:
*
* P0: read exit_state = 0
* P1: write exit_state = EXIT_DEAD
* P1: Do a wake up - wq is empty, so do nothing
* P0: Queue for polling - wait forever.
*/
- read_lock(&tasklist_lock);
- if (!task)
poll_flags = POLLIN | POLLRDNORM | POLLHUP;
- else if (task->exit_state && thread_group_empty(task))
poll_flags = POLLIN | POLLRDNORM;
- if (!poll_flags) {
pid = proc_pid(file->f_path.dentry->d_inode);
poll_wait(file, &pid->wait_pidfd, pts);
- }
- read_unlock(&tasklist_lock);
- if (task)
put_task_struct(task);
- return poll_flags;
+}
static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir,
- .poll = proc_tgid_base_poll, .iterate_shared = proc_tgid_base_readdir, .llseek = generic_file_llseek,
}; diff --git a/include/linux/pid.h b/include/linux/pid.h index b6f4ba16065a..2e0dcbc6d14e 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -3,6 +3,7 @@ #define _LINUX_PID_H #include <linux/rculist.h> +#include <linux/wait.h> enum pid_type { @@ -60,6 +61,8 @@ struct pid unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX];
- /* wait queue for pidfd pollers */
- wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[1];
}; diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..5c90c239242f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -214,6 +214,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]);
- init_waitqueue_head(&pid->wait_pidfd);
- upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING))
diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..e3781703ef7e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1800,6 +1800,17 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) return ret; } +static void do_wakeup_pidfd_pollers(struct task_struct *task) +{
- struct pid *pid;
- lockdep_assert_held(&tasklist_lock);
- pid = get_task_pid(task, PIDTYPE_PID);
- wake_up_all(&pid->wait_pidfd);
- put_pid(pid);
+}
/*
- Let a parent know about the death of a child.
- For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1823,6 +1834,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk)));
- /* Wake up all pidfd waiters */
- do_wakeup_pidfd_pollers(tsk);
- if (sig != SIGCHLD) { /*
- This is only possible if parent == real_parent.
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index deaf8073bc06..4b31c14f273c 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,4 +1,4 @@ -CFLAGS += -g -I../../../../usr/include/ +CFLAGS += -g -I../../../../usr/include/ -lpthread TEST_GEN_PROGS := pidfd_test diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index d59378a93782..57ae217339e9 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -4,18 +4,26 @@ #include <errno.h> #include <fcntl.h> #include <linux/types.h> +#include <pthread.h> #include <sched.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <syscall.h> +#include <sys/epoll.h> +#include <sys/mman.h> #include <sys/mount.h> #include <sys/wait.h> +#include <time.h> #include <unistd.h> #include "../kselftest.h" +#define CHILD_THREAD_MIN_WAIT 3 /* seconds */ +#define MAX_EVENTS 5 +#define __NR_pidfd_send_signal 424
static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) { @@ -30,6 +38,22 @@ static void set_signal_received_on_sigusr1(int sig) signal_received = 1; } +static int open_pidfd(const char *test_name, pid_t pid) +{
- char buf[256];
- int pidfd;
- snprintf(buf, sizeof(buf), "/proc/%d", pid);
- pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
- if (pidfd < 0)
ksft_exit_fail_msg(
"%s test: Failed to open process file descriptor\n",
test_name);
- return pidfd;
+}
/*
- Straightforward test to see whether pidfd_send_signal() works is to send
- a signal to ourself.
@@ -87,7 +111,6 @@ static int wait_for_pid(pid_t pid) static int test_pidfd_send_signal_exited_fail(void) { int pidfd, ret, saved_errno;
- char buf[256]; pid_t pid; const char *test_name = "pidfd_send_signal signal exited process";
@@ -99,17 +122,10 @@ static int test_pidfd_send_signal_exited_fail(void) if (pid == 0) _exit(EXIT_SUCCESS);
- snprintf(buf, sizeof(buf), "/proc/%d", pid);
- pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
- pidfd = open_pidfd(test_name, pid);
(void)wait_for_pid(pid);
- if (pidfd < 0)
ksft_exit_fail_msg(
"%s test: Failed to open process file descriptor\n",
test_name);
- ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); saved_errno = errno; close(pidfd);
@@ -368,10 +384,179 @@ static int test_pidfd_send_signal_syscall_support(void) return 0; } +void *test_pidfd_poll_exec_thread(void *priv) +{
- char waittime[256];
- ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n",
getpid(), syscall(SYS_gettid));
- ksft_print_msg("Child Thread: doing exec of sleep\n");
- sprintf(waittime, "%d", CHILD_THREAD_MIN_WAIT);
- execl("/bin/sleep", "sleep", waittime, (char *)NULL);
- ksft_print_msg("Child Thread: DONE. pid %d tid %d\n",
getpid(), syscall(SYS_gettid));
- return NULL;
+}
+static int poll_pidfd(const char *test_name, int pidfd) +{
- int c;
- int epoll_fd = epoll_create1(0);
- struct epoll_event event, events[MAX_EVENTS];
- if (epoll_fd == -1)
ksft_exit_fail_msg("%s test: Failed to create epoll file descriptor\n",
test_name);
- event.events = EPOLLIN;
- event.data.fd = pidfd;
- if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pidfd, &event)) {
ksft_print_msg("%s test: Failed to add epoll file descriptor: Skipping\n",
test_name);
_exit(PIDFD_SKIP);
- }
- c = epoll_wait(epoll_fd, events, MAX_EVENTS, 5000);
- if (c != 1 || !(events[0].events & EPOLLIN))
ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x)\n",
test_name, c, events[0].events);
- close(epoll_fd);
- return events[0].events;
+}
+int test_pidfd_poll_exec(int use_waitpid) +{
- int pid, pidfd;
- int status, ret;
- pthread_t t1;
- time_t prog_start = time(NULL);
- const char *test_name = "pidfd_poll check for premature notification on child thread exec";
- ksft_print_msg("Parent: pid: %d\n", getpid());
- pid = fork();
- if (pid == 0) {
ksft_print_msg("Child: starting. pid %d tid %d\n", getpid(),
syscall(SYS_gettid));
pthread_create(&t1, NULL, test_pidfd_poll_exec_thread, NULL);
/*
* Exec in the non-leader thread will destroy the leader immediately.
* If the wait in the parent returns too soon, the test fails.
*/
while (1)
;
- }
- ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);
- if (use_waitpid) {
ret = waitpid(pid, &status, 0);
if (ret == -1)
ksft_print_msg("Parent: error\n");
if (ret == pid)
ksft_print_msg("Parent: Child process waited for.\n");
- } else {
pidfd = open_pidfd(test_name, pid);
poll_pidfd(test_name, pidfd);
- }
- time_t prog_time = time(NULL) - prog_start;
- ksft_print_msg("Time waited for child: %lu\n", prog_time);
- close(pidfd);
- if (prog_time < CHILD_THREAD_MIN_WAIT || prog_time > CHILD_THREAD_MIN_WAIT + 2)
ksft_exit_fail_msg("%s test: Failed\n", test_name);
- else
ksft_test_result_pass("%s test: Passed\n", test_name);
+}
+void *test_pidfd_poll_leader_exit_thread(void *priv) +{
- char waittime[256];
- ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n",
getpid(), syscall(SYS_gettid));
- sleep(CHILD_THREAD_MIN_WAIT);
- ksft_print_msg("Child Thread: DONE. pid %d tid %d\n", getpid(), syscall(SYS_gettid));
- return NULL;
+}
+static time_t *child_exit_secs; +int test_pidfd_poll_leader_exit(int use_waitpid) +{
- int pid, pidfd;
- int status, ret;
- pthread_t t1, t2;
- time_t prog_start = time(NULL);
- const char *test_name = "pidfd_poll check for premature notification on non-empty"
"group leader exit";
- child_exit_secs = mmap(NULL, sizeof *child_exit_secs, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
- ksft_print_msg("Parent: pid: %d\n", getpid());
- pid = fork();
- if (pid == 0) {
ksft_print_msg("Child: starting. pid %d tid %d\n", getpid(), syscall(SYS_gettid));
pthread_create(&t1, NULL, test_pidfd_poll_leader_exit_thread, NULL);
pthread_create(&t2, NULL, test_pidfd_poll_leader_exit_thread, NULL);
/*
* glibc exit calls exit_group syscall, so explicity call exit only
* so that only the group leader exits, leaving the threads alone.
*/
*child_exit_secs = time(NULL);
syscall(SYS_exit, 0);
- }
- ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);
- if (use_waitpid) {
ret = waitpid(pid, &status, 0);
if (ret == -1)
ksft_print_msg("Parent: error\n");
- } else {
/*
* This sleep tests for the case where if the child exits, and is in
* EXIT_ZOMBIE, but the thread group leader is non-empty, then the poll
* doesn't prematurely return even though there are active threads
*/
sleep(1);
pidfd = open_pidfd(test_name, pid);
poll_pidfd(test_name, pidfd);
- }
- if (ret == pid)
ksft_print_msg("Parent: Child process waited for.\n");
- time_t since_child_exit = time(NULL) - *child_exit_secs;
- ksft_print_msg("Time since child exit: %lu\n", since_child_exit);
- close(pidfd);
- if (since_child_exit < CHILD_THREAD_MIN_WAIT ||
since_child_exit > CHILD_THREAD_MIN_WAIT + 2)
ksft_exit_fail_msg("%s test: Failed\n", test_name);
- else
ksft_test_result_pass("%s test: Passed\n", test_name);
+}
int main(int argc, char **argv) { ksft_print_header();
- test_pidfd_poll_exec(0);
- test_pidfd_poll_exec(1);
- test_pidfd_poll_leader_exit(0);
- test_pidfd_poll_leader_exit(1); test_pidfd_send_signal_syscall_support(); test_pidfd_send_signal_simple_success(); test_pidfd_send_signal_exited_fail();
On Fri, Apr 19, 2019 at 10:01:00PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:49:02PM -0400, Joel Fernandes wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > On 04/16, Joel Fernandes wrote: > > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > > > Could you explain when it should return POLLIN? When the whole process exits? > > > > It returns POLLIN when the task is dead or doesn't exist anymore, or when it > > is in a zombie state and there's no other thread in the thread group. > > IOW, when the whole thread group exits, so it can't be used to monitor sub-threads. > > just in case... speaking of this patch it doesn't modify proc_tid_base_operations, > so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are going to use > the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader. (Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking
So your idea is that you always get EPOLLIN when the process is alive, i.e. epoll_wait() immediately returns for a pidfd that referes to a live process if you specify EPOLLIN? E.g. if I specify EPOLLIN | EPOLLHUP then epoll_wait() would constantly return. I would then need to check for EPOLLHUP, see that it is not present and then go back into the epoll_wait() loop and play the same game again? What do you need this for?
The approach of this patch is we would return EPOLLIN only once the process exits. Until then it blocks.
And if you have a valid reason to do this would it make sense to set POLLPRI if the actual exit status can be read? This way one could at least specify POLLPRI | POLLHUP without being constantly woken.
we always return EPOLLIN. If process has not exited, then it blocks.
However, we also are returning EPOLLERR in previous patch if the task_struct has been reaped (task == NULL). I could change that to EPOLLHUP.
That would be here, right?:
- if (!task)
poll_flags = POLLIN | POLLRDNORM | POLLHUP;
That sounds better to me that EPOLLERR.
I see. Ok I agree with you. It is not really an error, because even though the task_struct doesn't exist, the data such as exit status would still be readable so IMO POLLHUP is better.
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote:
On 04/16, Joel Fernandes wrote: > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > Could you explain when it should return POLLIN? When the whole
process exits?
> > It returns POLLIN when the task is dead or doesn't exist anymore,
or when it
> is in a zombie state and there's no other thread in the thread
group.
IOW, when the whole thread group exits, so it can't be used to
monitor sub-threads.
just in case... speaking of this patch it doesn't modify
proc_tid_base_operations,
so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are
going to use
the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader. (Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
Besides, didn't Linux say that he wanted waitpid(2) to be the function for exit status on a pidfd, not read(2)? It doesn't really matter: POLLIN on a pidfd would just make "I, the kernel, say that waitpid on this FD won't block", whereas for something like socket, it would mean "I, the kernel, say read(2) on this FD won't block".
I don't see the need for POLLHUP in pidfds at all. IMHO, we shouldn't include it. "Hangup" doesn't have an obvious meaning distinct from exit, and we might want the POLLHUP bit for something else in the future. What would a caller do with POLLHUP? If the answer is "something to do with ptrace", let's defer that to some future work, since ptrace has a ton of moving parts I don't want to consider right now.
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > On 04/16, Joel Fernandes wrote: > > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > > > Could you explain when it should return POLLIN? When the whole process exits? > > > > It returns POLLIN when the task is dead or doesn't exist anymore, or when it > > is in a zombie state and there's no other thread in the thread group. > > IOW, when the whole thread group exits, so it can't be used to monitor sub-threads. > > just in case... speaking of this patch it doesn't modify proc_tid_base_operations, > so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are going to use > the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader. (Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd. We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea. EPOLLIN on a pidfd could very well mean that data can be read via a read() on the pidfd *other* than the exit status. The read could e.g. give you a lean struct that indicates the type of state transition: NOTIFY_EXIT, NOTIFY_EXEC, etc.. This way we are not bound to a specific poll event indicating a specific state. Though there's a case to be made that EPOLLHUP could indicate process exit and EPOLLIN a state change + read().
On Fri, Apr 19, 2019 at 10:57:11PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: >> On 04/16, Joel Fernandes wrote: >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: >> > > >> > > Could you explain when it should return POLLIN? When the whole >process exits? >> > >> > It returns POLLIN when the task is dead or doesn't exist anymore, >or when it >> > is in a zombie state and there's no other thread in the thread >group. >> >> IOW, when the whole thread group exits, so it can't be used to >monitor sub-threads. >> >> just in case... speaking of this patch it doesn't modify >proc_tid_base_operations, >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are >going to use >> the anonymous file returned by CLONE_PIDFD ? > >I don't think procfs works that way. /proc/sub-thread-tid has >proc_tgid_base_operations despite not being a thread group leader. >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can >be hit trivially, and then the code will misbehave. > >@Joel: I think you'll have to either rewrite this to explicitly bail >out if you're dealing with a thread group leader, or make the code >work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd. We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea. EPOLLIN on a pidfd could very well mean that data can be read via a read() on the pidfd *other* than the exit status. The read could e.g. give you a lean struct that indicates the type of state transition: NOTIFY_EXIT, NOTIFY_EXEC, etc.. This way we are not bound to a specific poll event indicating a specific state. Though there's a case to be made that EPOLLHUP could indicate process exit and EPOLLIN a state change + read().
According to Linus, POLLHUP usually indicates that something is readable: https://lkml.org/lkml/2019/4/18/1181 "So generally a HUP condition should mean that POLLIN and POLLOUT also get set. Not because there's any actual _data_ to be read, but simply because the read will not block."
I feel the future state changes such as for NOTIFY_EXEC can easily be implemented on top of this patch.
Just for the exit notification purposes, the states are: if process has exit_state == 0, block. if process is zombie/dead but not reaped, then return POLLIN if process is reaped, then return POLLIN | POLLHUP
for the exec notification case, that could be implemnted along with this with something like: if process has exit_state == 0, or has not exec'd since poll was called, block. if process exec'd, then return POLLIN if process is zombie/dead but not reaped, then return POLLIN if process is reaped, then return POLLIN | POLLHUP
Do you agree or did I miss something?
thanks,
- Joel
On Fri, Apr 19, 2019 at 2:20 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 10:57:11PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > >> On 04/16, Joel Fernandes wrote: > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > >> > > > >> > > Could you explain when it should return POLLIN? When the whole > >process exits? > >> > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > >or when it > >> > is in a zombie state and there's no other thread in the thread > >group. > >> > >> IOW, when the whole thread group exits, so it can't be used to > >monitor sub-threads. > >> > >> just in case... speaking of this patch it doesn't modify > >proc_tid_base_operations, > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > >going to use > >> the anonymous file returned by CLONE_PIDFD ? > > > >I don't think procfs works that way. /proc/sub-thread-tid has > >proc_tgid_base_operations despite not being a thread group leader. > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > >be hit trivially, and then the code will misbehave. > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > >out if you're dealing with a thread group leader, or make the code > >work for threads, too. > > The latter case probably being preferred if this API is supposed to be > useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd. We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea. EPOLLIN on a pidfd could very well mean that data can be read via a read() on the pidfd *other* than the exit status. The read could e.g. give you a lean struct that indicates the type of state transition: NOTIFY_EXIT, NOTIFY_EXEC, etc.. This way we are not bound to a specific poll event indicating a specific state. Though there's a case to be made that EPOLLHUP could indicate process exit and EPOLLIN a state change + read().
According to Linus, POLLHUP usually indicates that something is readable:
I don't think Linus said that POLLHUP means readable. He did say that it usually doesn't make sense to set POLLHUP without POLLIN, but that's not the same as POLLHUP indicating readability.
https://lkml.org/lkml/2019/4/18/1181 "So generally a HUP condition should mean that POLLIN and POLLOUT also get set. Not because there's any actual _data_ to be read, but simply because the read will not block."
I feel the future state changes such as for NOTIFY_EXEC can easily be implemented on top of this patch.
Just for the exit notification purposes, the states are: if process has exit_state == 0, block. if process is zombie/dead but not reaped, then return POLLIN if process is reaped, then return POLLIN | POLLHUP
Setting POLLHUP when the process is reaped is harmless, but I don't think it's useful. I can't think of a reason that anyone would care. You can't block and wait on reaping, so you could only busy-wait, and you can look for ESRCH on any proc file today to detect reaping. I'd rather keep POLLHUP available for some other use than use it to signal whether a process is reaped.
On Fri, Apr 19, 2019 at 02:24:09PM -0700, Daniel Colascione wrote:
On Fri, Apr 19, 2019 at 2:20 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 10:57:11PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > >> On 04/16, Joel Fernandes wrote: > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > >> > > > > >> > > Could you explain when it should return POLLIN? When the whole > > >process exits? > > >> > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > >or when it > > >> > is in a zombie state and there's no other thread in the thread > > >group. > > >> > > >> IOW, when the whole thread group exits, so it can't be used to > > >monitor sub-threads. > > >> > > >> just in case... speaking of this patch it doesn't modify > > >proc_tid_base_operations, > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > >going to use > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > >proc_tgid_base_operations despite not being a thread group leader. > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > >be hit trivially, and then the code will misbehave. > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > >out if you're dealing with a thread group leader, or make the code > > >work for threads, too. > > > > The latter case probably being preferred if this API is supposed to be > > useable for thread management in userspace. > > At the moment, we are not planning to use this for sub-thread management. I > am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
> discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd. We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea. EPOLLIN on a pidfd could very well mean that data can be read via a read() on the pidfd *other* than the exit status. The read could e.g. give you a lean struct that indicates the type of state transition: NOTIFY_EXIT, NOTIFY_EXEC, etc.. This way we are not bound to a specific poll event indicating a specific state. Though there's a case to be made that EPOLLHUP could indicate process exit and EPOLLIN a state change + read().
According to Linus, POLLHUP usually indicates that something is readable:
I don't think Linus said that POLLHUP means readable. He did say that it usually doesn't make sense to set POLLHUP without POLLIN, but that's not the same as POLLHUP indicating readability.
Ok, fair enough.
https://lkml.org/lkml/2019/4/18/1181 "So generally a HUP condition should mean that POLLIN and POLLOUT also get set. Not because there's any actual _data_ to be read, but simply because the read will not block."
I feel the future state changes such as for NOTIFY_EXEC can easily be implemented on top of this patch.
Just for the exit notification purposes, the states are: if process has exit_state == 0, block. if process is zombie/dead but not reaped, then return POLLIN if process is reaped, then return POLLIN | POLLHUP
Setting POLLHUP when the process is reaped is harmless, but I don't think it's useful. I can't think of a reason that anyone would care.
We can also outright remove it. Oleg seemed to not mind it, in fact he said it may be useful to indicate the reap status so at least I am inclined to leave it in.
You can't block and wait on reaping, so you could only busy-wait, and you can look for ESRCH on any proc file today to detect reaping. I'd
proc file reading is racy though. We shouldn't even talk about that since the point of pidfd is to avoid such raw "pid" related races.
rather keep POLLHUP available for some other use than use it to signal whether a process is reaped.
Like what? I feel it is semantically better than POLLERR.
thanks,
- Joel
On Fri, Apr 19, 2019 at 2:45 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 02:24:09PM -0700, Daniel Colascione wrote:
On Fri, Apr 19, 2019 at 2:20 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 10:57:11PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote: > On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > > >> On 04/16, Joel Fernandes wrote: > > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > >> > > > > > >> > > Could you explain when it should return POLLIN? When the whole > > > >process exits? > > > >> > > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > > >or when it > > > >> > is in a zombie state and there's no other thread in the thread > > > >group. > > > >> > > > >> IOW, when the whole thread group exits, so it can't be used to > > > >monitor sub-threads. > > > >> > > > >> just in case... speaking of this patch it doesn't modify > > > >proc_tid_base_operations, > > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > > >going to use > > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > > >proc_tgid_base_operations despite not being a thread group leader. > > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > > >be hit trivially, and then the code will misbehave. > > > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > > >out if you're dealing with a thread group leader, or make the code > > > >work for threads, too. > > > > > > The latter case probably being preferred if this API is supposed to be > > > useable for thread management in userspace. > > > > At the moment, we are not planning to use this for sub-thread management. I > > am reworking this patch to only work on clone(2) pidfds which makes the above > > Indeed and agreed. > > > discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > > patches, CLONE_THREAD with pidfd is not supported. > > Yes. We have no one asking for it right now and we can easily add this > later. > > Admittedly I haven't gotten around to reviewing the patches here yet > completely. But one thing about using POLLIN. FreeBSD is using POLLHUP > on process exit which I think is nice as well. How about returning > POLLIN | POLLHUP on process exit? > We already do things like this. For example, when you proxy between > ttys. If the process that you're reading data from has exited and closed > it's end you still can't usually simply exit because it might have still > buffered data that you want to read. The way one can deal with this > from userspace is that you can observe a (POLLHUP | POLLIN) event and > you keep on reading until you only observe a POLLHUP without a POLLIN > event at which point you know you have read > all data. > I like the semantics for pidfds as well as it would indicate: > - POLLHUP -> process has exited > - POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd. We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea. EPOLLIN on a pidfd could very well mean that data can be read via a read() on the pidfd *other* than the exit status. The read could e.g. give you a lean struct that indicates the type of state transition: NOTIFY_EXIT, NOTIFY_EXEC, etc.. This way we are not bound to a specific poll event indicating a specific state. Though there's a case to be made that EPOLLHUP could indicate process exit and EPOLLIN a state change + read().
According to Linus, POLLHUP usually indicates that something is readable:
I don't think Linus said that POLLHUP means readable. He did say that it usually doesn't make sense to set POLLHUP without POLLIN, but that's not the same as POLLHUP indicating readability.
Ok, fair enough.
https://lkml.org/lkml/2019/4/18/1181 "So generally a HUP condition should mean that POLLIN and POLLOUT also get set. Not because there's any actual _data_ to be read, but simply because the read will not block."
I feel the future state changes such as for NOTIFY_EXEC can easily be implemented on top of this patch.
Just for the exit notification purposes, the states are: if process has exit_state == 0, block. if process is zombie/dead but not reaped, then return POLLIN if process is reaped, then return POLLIN | POLLHUP
Setting POLLHUP when the process is reaped is harmless, but I don't think it's useful. I can't think of a reason that anyone would care.
We can also outright remove it. Oleg seemed to not mind it, in fact he said it may be useful to indicate the reap status so at least I am inclined to leave it in.
You can't block and wait on reaping, so you could only busy-wait, and you can look for ESRCH on any proc file today to detect reaping. I'd
proc file reading is racy though. We shouldn't even talk about that since the point of pidfd is to avoid such raw "pid" related races.
It's not racy if you have a procfs dirfd handle open anyway. But since we moved to the two-kinds-of-file-descriptor model, there's no way to reliably open the procfs directory FD for a process that's exited but that's not yet been reaped. The fdinfo mechanism doesn't solve this problem. But adding a reap flag to poll output doesn't close the race either.
I don't see any actual benefit to knowing via poll whether a process was reaped. If you can identify such a use case, great. Otherwise, I want to keep the poll interface simple and not specify semantics for POLLHUP.
On Sat, Apr 20, 2019 at 12:08 AM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 2:45 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 02:24:09PM -0700, Daniel Colascione wrote:
On Fri, Apr 19, 2019 at 2:20 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 10:57:11PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote: > > On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote: > > On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > > > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > > > >> On 04/16, Joel Fernandes wrote: > > > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > >> > > > > > > >> > > Could you explain when it should return POLLIN? When the whole > > > > >process exits? > > > > >> > > > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > > > >or when it > > > > >> > is in a zombie state and there's no other thread in the thread > > > > >group. > > > > >> > > > > >> IOW, when the whole thread group exits, so it can't be used to > > > > >monitor sub-threads. > > > > >> > > > > >> just in case... speaking of this patch it doesn't modify > > > > >proc_tid_base_operations, > > > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > > > >going to use > > > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > > > >proc_tgid_base_operations despite not being a thread group leader. > > > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > > > >be hit trivially, and then the code will misbehave. > > > > > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > > > >out if you're dealing with a thread group leader, or make the code > > > > >work for threads, too. > > > > > > > > The latter case probably being preferred if this API is supposed to be > > > > useable for thread management in userspace. > > > > > > At the moment, we are not planning to use this for sub-thread management. I > > > am reworking this patch to only work on clone(2) pidfds which makes the above > > > > Indeed and agreed. > > > > > discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > > > patches, CLONE_THREAD with pidfd is not supported. > > > > Yes. We have no one asking for it right now and we can easily add this > > later. > > > > Admittedly I haven't gotten around to reviewing the patches here yet > > completely. But one thing about using POLLIN. FreeBSD is using POLLHUP > > on process exit which I think is nice as well. How about returning > > POLLIN | POLLHUP on process exit? > > We already do things like this. For example, when you proxy between > > ttys. If the process that you're reading data from has exited and closed > > it's end you still can't usually simply exit because it might have still > > buffered data that you want to read. The way one can deal with this > > from userspace is that you can observe a (POLLHUP | POLLIN) event and > > you keep on reading until you only observe a POLLHUP without a POLLIN > > event at which point you know you have read > > all data. > > I like the semantics for pidfds as well as it would indicate: > > - POLLHUP -> process has exited > > - POLLIN -> information can be read > > Actually I think a bit different about this, in my opinion the pidfd should > always be readable (we would store the exit status somewhere in the future > which would be readable, even after task_struct is dead). So I was thinking > we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd. We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea. EPOLLIN on a pidfd could very well mean that data can be read via a read() on the pidfd *other* than the exit status. The read could e.g. give you a lean struct that indicates the type of state transition: NOTIFY_EXIT, NOTIFY_EXEC, etc.. This way we are not bound to a specific poll event indicating a specific state. Though there's a case to be made that EPOLLHUP could indicate process exit and EPOLLIN a state change + read().
According to Linus, POLLHUP usually indicates that something is readable:
I don't think Linus said that POLLHUP means readable. He did say that it usually doesn't make sense to set POLLHUP without POLLIN, but that's not the same as POLLHUP indicating readability.
Ok, fair enough.
https://lkml.org/lkml/2019/4/18/1181 "So generally a HUP condition should mean that POLLIN and POLLOUT also get set. Not because there's any actual _data_ to be read, but simply because the read will not block."
I feel the future state changes such as for NOTIFY_EXEC can easily be implemented on top of this patch.
Just for the exit notification purposes, the states are: if process has exit_state == 0, block. if process is zombie/dead but not reaped, then return POLLIN if process is reaped, then return POLLIN | POLLHUP
Setting POLLHUP when the process is reaped is harmless, but I don't think it's useful. I can't think of a reason that anyone would care.
We can also outright remove it. Oleg seemed to not mind it, in fact he said it may be useful to indicate the reap status so at least I am inclined to leave it in.
You can't block and wait on reaping, so you could only busy-wait, and you can look for ESRCH on any proc file today to detect reaping. I'd
proc file reading is racy though. We shouldn't even talk about that since the point of pidfd is to avoid such raw "pid" related races.
It's not racy if you have a procfs dirfd handle open anyway. But since we moved to the two-kinds-of-file-descriptor model, there's no way to reliably open the procfs directory FD for a process that's exited but that's not yet been reaped. The fdinfo mechanism doesn't solve this
Uhm, maybe I'm slow but why?
static int do_child(void *p) { exit(EXIT_SUCCESS); }
pid_t pid = clone(do_child, CLONE_PIDFD, &pidfd);
sleep(10);
/* process exits but is not reaped yet */ procfd = open("/proc/pid");
if (pidfd_send_signal(pidfd, 0, NULL, 0) == 0) /* procfd is valid */
waitpid(pid);
problem. But adding a reap flag to poll output doesn't close the race either.
I don't see any actual benefit to knowing via poll whether a process was reaped. If you can identify such a use case, great. Otherwise, I want to keep the poll interface simple and not specify semantics for POLLHUP.
On Fri, Apr 19, 2019 at 3:18 PM Christian Brauner christian@brauner.io wrote:
On Sat, Apr 20, 2019 at 12:08 AM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 2:45 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 02:24:09PM -0700, Daniel Colascione wrote:
On Fri, Apr 19, 2019 at 2:20 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 10:57:11PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote: > > On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote: > > > > On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote: > > > On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > > > > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > > > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > > > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > > > > >> On 04/16, Joel Fernandes wrote: > > > > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > > >> > > > > > > > >> > > Could you explain when it should return POLLIN? When the whole > > > > > >process exits? > > > > > >> > > > > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > > > > >or when it > > > > > >> > is in a zombie state and there's no other thread in the thread > > > > > >group. > > > > > >> > > > > > >> IOW, when the whole thread group exits, so it can't be used to > > > > > >monitor sub-threads. > > > > > >> > > > > > >> just in case... speaking of this patch it doesn't modify > > > > > >proc_tid_base_operations, > > > > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > > > > >going to use > > > > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > > > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > > > > >proc_tgid_base_operations despite not being a thread group leader. > > > > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > > > > >be hit trivially, and then the code will misbehave. > > > > > > > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > > > > >out if you're dealing with a thread group leader, or make the code > > > > > >work for threads, too. > > > > > > > > > > The latter case probably being preferred if this API is supposed to be > > > > > useable for thread management in userspace. > > > > > > > > At the moment, we are not planning to use this for sub-thread management. I > > > > am reworking this patch to only work on clone(2) pidfds which makes the above > > > > > > Indeed and agreed. > > > > > > > discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > > > > patches, CLONE_THREAD with pidfd is not supported. > > > > > > Yes. We have no one asking for it right now and we can easily add this > > > later. > > > > > > Admittedly I haven't gotten around to reviewing the patches here yet > > > completely. But one thing about using POLLIN. FreeBSD is using POLLHUP > > > on process exit which I think is nice as well. How about returning > > > POLLIN | POLLHUP on process exit? > > > We already do things like this. For example, when you proxy between > > > ttys. If the process that you're reading data from has exited and closed > > > it's end you still can't usually simply exit because it might have still > > > buffered data that you want to read. The way one can deal with this > > > from userspace is that you can observe a (POLLHUP | POLLIN) event and > > > you keep on reading until you only observe a POLLHUP without a POLLIN > > > event at which point you know you have read > > > all data. > > > I like the semantics for pidfds as well as it would indicate: > > > - POLLHUP -> process has exited > > > - POLLIN -> information can be read > > > > Actually I think a bit different about this, in my opinion the pidfd should > > always be readable (we would store the exit status somewhere in the future > > which would be readable, even after task_struct is dead). So I was thinking > > we always return EPOLLIN. If process has not exited, then it blocks. > > ITYM that a pidfd polls as readable *once a task exits* and stays > readable forever. Before a task exit, a poll on a pidfd should *not* > yield POLLIN and reading that pidfd should *not* complete immediately. > There's no way that, having observed POLLIN on a pidfd, you should > ever then *not* see POLLIN on that pidfd in the future --- it's a > one-way transition from not-ready-to-get-exit-status to > ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd. We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea. EPOLLIN on a pidfd could very well mean that data can be read via a read() on the pidfd *other* than the exit status. The read could e.g. give you a lean struct that indicates the type of state transition: NOTIFY_EXIT, NOTIFY_EXEC, etc.. This way we are not bound to a specific poll event indicating a specific state. Though there's a case to be made that EPOLLHUP could indicate process exit and EPOLLIN a state change + read().
According to Linus, POLLHUP usually indicates that something is readable:
I don't think Linus said that POLLHUP means readable. He did say that it usually doesn't make sense to set POLLHUP without POLLIN, but that's not the same as POLLHUP indicating readability.
Ok, fair enough.
https://lkml.org/lkml/2019/4/18/1181 "So generally a HUP condition should mean that POLLIN and POLLOUT also get set. Not because there's any actual _data_ to be read, but simply because the read will not block."
I feel the future state changes such as for NOTIFY_EXEC can easily be implemented on top of this patch.
Just for the exit notification purposes, the states are: if process has exit_state == 0, block. if process is zombie/dead but not reaped, then return POLLIN if process is reaped, then return POLLIN | POLLHUP
Setting POLLHUP when the process is reaped is harmless, but I don't think it's useful. I can't think of a reason that anyone would care.
We can also outright remove it. Oleg seemed to not mind it, in fact he said it may be useful to indicate the reap status so at least I am inclined to leave it in.
You can't block and wait on reaping, so you could only busy-wait, and you can look for ESRCH on any proc file today to detect reaping. I'd
proc file reading is racy though. We shouldn't even talk about that since the point of pidfd is to avoid such raw "pid" related races.
It's not racy if you have a procfs dirfd handle open anyway. But since we moved to the two-kinds-of-file-descriptor model, there's no way to reliably open the procfs directory FD for a process that's exited but that's not yet been reaped. The fdinfo mechanism doesn't solve this
Uhm, maybe I'm slow but why?
static int do_child(void *p) { exit(EXIT_SUCCESS); }
pid_t pid = clone(do_child, CLONE_PIDFD, &pidfd);
sleep(10);
/* process exits but is not reaped yet */ procfd = open("/proc/pid");
if (pidfd_send_signal(pidfd, 0, NULL, 0) == 0) /* procfd is valid */
waitpid(pid);
pidfd_send_signal will fail on a zombie, won't it? Or am I slow? :-)
On 19.04.19 23:24, Daniel Colascione wrote:
Hi folks,
<big_snip>
I haven't followed the whole thread, so forgive me if I'm going to write something dumb here ... but: this all feels a bit complicated to me.
Why not just making the pidfd present a stream of events (which are encoded as text lines) ? It would behave just like any stream, eg. socket, chardev, ...
An `cat` on a pidfd could look like this:
<timestamp> EXIT <exitcode> <timestamp> EXEC <cmdline> <timestamp> SIGNAL <sigid> ...
IOW: just using POLLIN. Let the listener just read all events and decide which ones he's actually acting on.
In the other direction we could write in commands, eg. for sending signals.
--mtx
On Fri, Apr 19, 2019 at 11:20 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 10:57:11PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > >> On 04/16, Joel Fernandes wrote: > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > >> > > > >> > > Could you explain when it should return POLLIN? When the whole > >process exits? > >> > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > >or when it > >> > is in a zombie state and there's no other thread in the thread > >group. > >> > >> IOW, when the whole thread group exits, so it can't be used to > >monitor sub-threads. > >> > >> just in case... speaking of this patch it doesn't modify > >proc_tid_base_operations, > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > >going to use > >> the anonymous file returned by CLONE_PIDFD ? > > > >I don't think procfs works that way. /proc/sub-thread-tid has > >proc_tgid_base_operations despite not being a thread group leader. > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > >be hit trivially, and then the code will misbehave. > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > >out if you're dealing with a thread group leader, or make the code > >work for threads, too. > > The latter case probably being preferred if this API is supposed to be > useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd. We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea. EPOLLIN on a pidfd could very well mean that data can be read via a read() on the pidfd *other* than the exit status. The read could e.g. give you a lean struct that indicates the type of state transition: NOTIFY_EXIT, NOTIFY_EXEC, etc.. This way we are not bound to a specific poll event indicating a specific state. Though there's a case to be made that EPOLLHUP could indicate process exit and EPOLLIN a state change + read().
According to Linus, POLLHUP usually indicates that something is readable: https://lkml.org/lkml/2019/4/18/1181 "So generally a HUP condition should mean that POLLIN and POLLOUT also get set. Not because there's any actual _data_ to be read, but simply because the read will not block."
I feel the future state changes such as for NOTIFY_EXEC can easily be implemented on top of this patch.
Just for the exit notification purposes, the states are: if process has exit_state == 0, block. if process is zombie/dead but not reaped, then return POLLIN if process is reaped, then return POLLIN | POLLHUP
Oleg was explicitly against EXIT_ZOMBIE/DEAD thing, no? He said so in a prior mail. Has this been addressed?
for the exec notification case, that could be implemnted along with this with something like: if process has exit_state == 0, or has not exec'd since poll was called, block. if process exec'd, then return POLLIN if process is zombie/dead but not reaped, then return POLLIN if process is reaped, then return POLLIN | POLLHUP
Do you agree or did I miss something?
I'm not sure why a combination of flags is nicer than having a simple read method that is more flexible but as the author you should send the patch how you would like it to be for review. :)
Christian
On 04/19, Christian Brauner wrote:
Just for the exit notification purposes, the states are: if process has exit_state == 0, block. if process is zombie/dead but not reaped, then return POLLIN if process is reaped, then return POLLIN | POLLHUP
Oleg was explicitly against EXIT_ZOMBIE/DEAD thing, no? He said so in a prior mail. Has this been addressed?
Yes, please don't use EXIT_XXX codes, nobody should ever use them except the core kernel exit/wait paths. For example, EXIT_DEAD means that the task auto- reaps or its parent had to temporary drop tasklist.
Just check ->exit_state != 0 && thread_group_empty(). Note that we need thread_group_empty() only for the case when the task is traced, in this case we have an extra notification for debugger which can confuse pidfd_poll().
And of course, everything will differ if/when we will need to monitor the sub-threads.
And btw I don't think it needs tasklist_lock, but lets discuss this when we have a new version based on pidfd.
Oleg.
On 04/20, Oleg Nesterov wrote:
Note that we need thread_group_empty() only for the case when the task is traced,
Damn, sorry for noise... of course this is not true, I tried to say that notification won't come otherwise. Nevermind, lets discuss this when we have a patch. I already got lost in this thread and I have no idea what are you all talking about ;)
Oleg.
On Sat, Apr 20, 2019 at 02:26:15PM +0200, Oleg Nesterov wrote:
On 04/20, Oleg Nesterov wrote:
Note that we need thread_group_empty() only for the case when the task is traced,
Damn, sorry for noise... of course this is not true, I tried to say that notification won't come otherwise. Nevermind, lets discuss this when we have a patch. I already got lost in this thread and I have no idea what
Agreed. The polling work is not targeted for the 5.2 merge window anyway so there's no need to rush this.
are you all talking about ;)
Nothing patch-worthy comes without a solid portion of controversy and confusion at first. ;)
Christian
On Fri, Apr 19, 2019 at 2:20 PM Joel Fernandes joel@joelfernandes.org wrote:
According to Linus, POLLHUP usually indicates that something is readable:
Note that if you use the legacy interfaces (ie "select()"), then even just a plain POLLHUP will always just show as "readable".
So for a lot of applications, it won't even matter. Returning POLLHUP will mean it's readable whether POLLIN is set or not (and EPOLLERR will automatically mean that it's marked both readable and writable).
In fact, I'd argue that not a lot of people care about the more esoteric bits at all. If your poll function does POLLIN and POLLOUT, it's fine. Anything else is specialized enough that most people simply won't care. Don't overdesign things too much. You need to have a major reason to care about the other POLL bits.
It's also worth noting that POLLERR/POLLHUP/POLLNVAL cannot be masked for "poll()". Even if you only ask for POLLIN/POLLOUT, you will always get POLLERR/POLLHUP notification. That is again historical behavior, and it's kind of a "you can't poll a hung up fd". But it once again means that you should consider POLLHUP to be something *exceptional* and final, where no further or other state changes can happen or are relevant.
That may well work fine for pidfd when the task is gone, but it's really worth noting that any *normal* state should be about POLLIN/POLLOUT. People should not think that "POLLHUP sounds like the appropriate name", they should realize that POLLHUP is basically a terminal error condition, not a "input is available".
So just keep that in mind.
Linus
On Sat, Apr 20, 2019 at 1:11 AM Linus Torvalds torvalds@linux-foundation.org wrote:
On Fri, Apr 19, 2019 at 2:20 PM Joel Fernandes joel@joelfernandes.org wrote:
According to Linus, POLLHUP usually indicates that something is readable:
Note that if you use the legacy interfaces (ie "select()"), then even just a plain POLLHUP will always just show as "readable".
So for a lot of applications, it won't even matter. Returning POLLHUP will mean it's readable whether POLLIN is set or not (and EPOLLERR will automatically mean that it's marked both readable and writable).
In fact, I'd argue that not a lot of people care about the more esoteric bits at all. If your poll function does POLLIN and POLLOUT, it's fine. Anything else is specialized enough that most people simply won't care. Don't overdesign things too much. You need to have a major reason to care about the other POLL bits.
It's also worth noting that POLLERR/POLLHUP/POLLNVAL cannot be masked for "poll()". Even if you only ask for POLLIN/POLLOUT, you will always get POLLERR/POLLHUP notification. That is again historical behavior, and it's kind of a "you can't poll a hung up fd". But it once again means that you should consider POLLHUP to be something *exceptional* and final, where no further or other state changes can happen or are relevant.
Which kind of makes sense for process exit. So the historical behavior here is in our favor and having POLLIN | POLLHUP rather fitting. It just seems right that POLLHUP indicates "there can be no more state transitions".
Christian
On Fri, Apr 19, 2019 at 4:20 PM Christian Brauner christian@brauner.io wrote:
On Sat, Apr 20, 2019 at 1:11 AM Linus Torvalds torvalds@linux-foundation.org wrote:
It's also worth noting that POLLERR/POLLHUP/POLLNVAL cannot be masked for "poll()". Even if you only ask for POLLIN/POLLOUT, you will always get POLLERR/POLLHUP notification. That is again historical behavior, and it's kind of a "you can't poll a hung up fd". But it once again means that you should consider POLLHUP to be something *exceptional* and final, where no further or other state changes can happen or are relevant.
Which kind of makes sense for process exit. So the historical behavior here is in our favor and having POLLIN | POLLHUP rather fitting. It just seems right that POLLHUP indicates "there can be no more state transitions".
Note that that is *not* true of process exit.
The final state transition isn't "exit", it is actually "process has been reaped". That's the point where data no longer exists.
Arguably "exit()" just means "pidfd is now readable - you can read the status". That sounds very much like a normal POLLIN condition to me, since the whole *point* of read() on pidfd is presumably to read the status.
Now, if you want to have other state transitions (ie read execve/fork/whatever state details), then you could say that _those_ state transitions are just POLLIN, but that the exit state transition is POLLIN | POLLHUP. But logically to me it still smells like the process being reaped should be POLLHUP.
You could also say that the execve/fork/whatever state is out of band data, and use EPOLLRDBAND for it.
But in fact EPOLLPRI might be better for that, because that works well with select() (ei if you want to select for execve/fork, you use the "ex" bitmask).
That said, if FreeBSD already has something like this, and people actually have code that uses it, there's also simply a strong argument for "don't be needlessly different".
Linus
On Fri, Apr 19, 2019 at 4:33 PM Linus Torvalds torvalds@linux-foundation.org wrote:
On Fri, Apr 19, 2019 at 4:20 PM Christian Brauner christian@brauner.io wrote:
On Sat, Apr 20, 2019 at 1:11 AM Linus Torvalds torvalds@linux-foundation.org wrote:
It's also worth noting that POLLERR/POLLHUP/POLLNVAL cannot be masked for "poll()". Even if you only ask for POLLIN/POLLOUT, you will always get POLLERR/POLLHUP notification. That is again historical behavior, and it's kind of a "you can't poll a hung up fd". But it once again means that you should consider POLLHUP to be something *exceptional* and final, where no further or other state changes can happen or are relevant.
Which kind of makes sense for process exit. So the historical behavior here is in our favor and having POLLIN | POLLHUP rather fitting. It just seems right that POLLHUP indicates "there can be no more state transitions".
Note that that is *not* true of process exit.
The final state transition isn't "exit", it is actually "process has been reaped". That's the point where data no longer exists.
FWIW, I think the exit status should be available via pidfd even after process reaping. A non-parent holder of a pidfd has no ability to control when the parent reaps the child, or even if reaping is necessary at all --- the parent could make SIGCHLD SIG_IGN. Someone trying to read exit status via a pidfd shouldn't fail to get that exit status just because he lost the race with a concurrent waitpid().
On Fri, Apr 19, 2019 at 04:11:37PM -0700, Linus Torvalds wrote:
On Fri, Apr 19, 2019 at 2:20 PM Joel Fernandes joel@joelfernandes.org wrote:
According to Linus, POLLHUP usually indicates that something is readable:
Note that if you use the legacy interfaces (ie "select()"), then even just a plain POLLHUP will always just show as "readable".
So for a lot of applications, it won't even matter. Returning POLLHUP will mean it's readable whether POLLIN is set or not (and EPOLLERR will automatically mean that it's marked both readable and writable).
In fact, I'd argue that not a lot of people care about the more esoteric bits at all. If your poll function does POLLIN and POLLOUT, it's fine. Anything else is specialized enough that most people simply won't care. Don't overdesign things too much. You need to have a major reason to care about the other POLL bits.
It's also worth noting that POLLERR/POLLHUP/POLLNVAL cannot be masked for "poll()". Even if you only ask for POLLIN/POLLOUT, you will always get POLLERR/POLLHUP notification. That is again historical behavior, and it's kind of a "you can't poll a hung up fd". But it once again means that you should consider POLLHUP to be something *exceptional* and final, where no further or other state changes can happen or are relevant.
That may well work fine for pidfd when the task is gone, but it's really worth noting that any *normal* state should be about POLLIN/POLLOUT. People should not think that "POLLHUP sounds like the appropriate name", they should realize that POLLHUP is basically a terminal error condition, not a "input is available".
So just keep that in mind.
Got it, thanks a lot for the detailed explanation of the flags. So then I feel I should not return POLLHUP or POLLERR because the task_struct getting reaped is not an error condition. I will simplify the patch and repost it. It seems that would be the best thing to do to serve the usecase. We just want to know when the task exited and a reaped task also counts as one that exited.
thanks,
- Joel
On Fri, Apr 19, 2019 at 1:57 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote:
On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: >> On 04/16, Joel Fernandes wrote: >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: >> > > >> > > Could you explain when it should return POLLIN? When the whole >process exits? >> > >> > It returns POLLIN when the task is dead or doesn't exist anymore, >or when it >> > is in a zombie state and there's no other thread in the thread >group. >> >> IOW, when the whole thread group exits, so it can't be used to >monitor sub-threads. >> >> just in case... speaking of this patch it doesn't modify >proc_tid_base_operations, >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are >going to use >> the anonymous file returned by CLONE_PIDFD ? > >I don't think procfs works that way. /proc/sub-thread-tid has >proc_tgid_base_operations despite not being a thread group leader. >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can >be hit trivially, and then the code will misbehave. > >@Joel: I think you'll have to either rewrite this to explicitly bail >out if you're dealing with a thread group leader, or make the code >work for threads, too.
The latter case probably being preferred if this API is supposed to be useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd.
Sure, but systemd is specialized.
There are two broad classes of programs that care about process exit status: 1) those that just want to do something and wait for it to complete, and 2) programs that want to perform detailed monitoring of processes and intervention in their state. #1 is overwhelmingly more common. The basic pidfd feature should take care of case #1 only, as wait*() in file descriptor form. I definitely don't think we should be complicating the interface and making it more error-prone (see below) for the sake of that rare program that cares about non-exit notification conditions. You're proposing a complicated combination of poll bit flags that most users (the ones who just wait to wait for processes) don't care about and that risk making the facility hard to use with existing event loops, which generally recognize readability and writability as the only properties that are worth monitoring.
We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea.
Exit status information should be *at least* as broadly available through pidfds as it is through the last field of /proc/pid/stat today, and probably more broadly. I've been saying for six months now that we need to talk about *who* should have access to exit status information. We haven't had that conversation yet. My preference is to just make exit status information globally available, as FreeBSD seems to do. I think it would be broadly useful for something like pkill to wait for processes to exit and to retrieve their exit information.
Speaking of pkill: AIUI, in your current patch set, one can get a pidfd *only* via clone. Joel indicated that he believes poll(2) shouldn't be supported on procfs pidfds. Is that your thinking as well? If that's the case, then we're in a state where non-parents can't wait for process exit, and providing this facility is an important goal of the whole project.
EPOLLIN on a pidfd could very well mean that data can be read via a read() on the pidfd *other* than the exit status. The read could e.g. give you a lean struct that indicates the type of state transition: NOTIFY_EXIT, NOTIFY_EXEC, etc.. This way we are not bound to a specific poll event indicating a specific state. Though there's a case to be made that EPOLLHUP could indicate process exit and EPOLLIN a state change + read().
And do you imagine making read() destructive? Does that read() then reset the POLLIN state? You're essentially proposing that a pidfd provide an "event stream" interface, delivering notifications packets that indicate state changes like "process exited" or "process stopped" or "process execed". While this sort of interface is powerful and has some nice properties that tools like debuggers and daemon monitors might want to use, I think it's too complicated and error prone for the overwhelmingly common case of wanting to monitor process lifetime. I'd much rather pidfd provide a simple one-state-transition level-triggered (not edge-triggered, as your suggestion implies) facility. If we want to let sophisticated programs read a stream of notification packets indicating changes in process state, we can provide that as a separate interface in future work.
I like Linus' idea of just making waitid(2) (not waitpid(2), as I mistakenly mentioned earlier) on a pidfd act *exactly* like a waitid(2) on the corresponding process and making POLLIN just mean "waitid will succeed". It's a nice simple model that's easy to reason about and that makes it easy to port existing code to pidfds.
I am very much against signaling additional information on basic pidfds using non-POLLIN poll flags.
On Fri, Apr 19, 2019 at 11:21 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 1:57 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote:
On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > >> On 04/16, Joel Fernandes wrote: > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > >> > > > >> > > Could you explain when it should return POLLIN? When the whole > >process exits? > >> > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > >or when it > >> > is in a zombie state and there's no other thread in the thread > >group. > >> > >> IOW, when the whole thread group exits, so it can't be used to > >monitor sub-threads. > >> > >> just in case... speaking of this patch it doesn't modify > >proc_tid_base_operations, > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > >going to use > >> the anonymous file returned by CLONE_PIDFD ? > > > >I don't think procfs works that way. /proc/sub-thread-tid has > >proc_tgid_base_operations despite not being a thread group leader. > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > >be hit trivially, and then the code will misbehave. > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > >out if you're dealing with a thread group leader, or make the code > >work for threads, too. > > The latter case probably being preferred if this API is supposed to be > useable for thread management in userspace.
At the moment, we are not planning to use this for sub-thread management. I am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd.
Sure, but systemd is specialized.
So is Android and we're not designing an interface for Android but for all of userspace. I hope this is clear. Service managers are quite important and systemd is the largest one and they can make good use of this feature.
There are two broad classes of programs that care about process exit status: 1) those that just want to do something and wait for it to complete, and 2) programs that want to perform detailed monitoring of processes and intervention in their state. #1 is overwhelmingly more common. The basic pidfd feature should take care of case #1 only, as wait*() in file descriptor form. I definitely don't think we should be complicating the interface and making it more error-prone (see below) for the sake of that rare program that cares about non-exit notification conditions. You're proposing a complicated combination of poll bit flags that most users (the ones who just wait to wait for processes) don't care about and that risk making the facility hard to use with existing event loops, which generally recognize readability and writability as the only properties that are worth monitoring.
That whole pargraph is about dismissing a range of valid use-cases based on assumptions such as "way more common" and even argues that service managers are special cases and therefore not really worth considering. I would like to be more open to other use cases.
We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea.
Exit status information should be *at least* as broadly available through pidfds as it is through the last field of /proc/pid/stat today, and probably more broadly. I've been saying for six months now that we need to talk about *who* should have access to exit status information. We haven't had that conversation yet. My preference is to
just make exit status information globally available, as FreeBSD seems to do. I think it would be broadly useful for something like pkill to
From the pdfork() FreeBSD manpage:
"poll(2) and select(2) allow waiting for process state transitions; currently only POLLHUP is defined, and will be raised when the process dies. Process state transitions can also be monitored using kqueue(2) filter EVFILT_PROCDESC; currently only NOTE_EXIT is implemented."
wait for processes to exit and to retrieve their exit information.
Speaking of pkill: AIUI, in your current patch set, one can get a pidfd *only* via clone. Joel indicated that he believes poll(2) shouldn't be supported on procfs pidfds. Is that your thinking as well? If that's the case, then we're in a state where non-parents
Yes, it is.
can't wait for process exit, and providing this facility is an important goal of the whole project.
That's your goal.
EPOLLIN on a pidfd could very well mean that data can be read via a read() on the pidfd *other* than the exit status. The read could e.g. give you a lean struct that indicates the type of state transition: NOTIFY_EXIT, NOTIFY_EXEC, etc.. This way we are not bound to a specific poll event indicating a specific state. Though there's a case to be made that EPOLLHUP could indicate process exit and EPOLLIN a state change + read().
And do you imagine making read() destructive? Does that read() then reset the POLLIN state? You're essentially proposing that a pidfd provide an "event stream" interface, delivering notifications packets that indicate state changes like "process exited" or "process stopped" or "process execed". While this sort of interface is powerful and has some nice properties that tools like debuggers and daemon monitors might want to use, I think it's too complicated and error prone for
That's an assumption again.
the overwhelmingly common case of wanting to monitor process lifetime.
Again where is this assumption backed up? systemd is a valid example where they care about this, various container managers are another one, and FreeBSD already does this.
I'd much rather pidfd provide a simple one-state-transition level-triggered (not edge-triggered, as your suggestion implies) facility. If we want to let sophisticated programs read a stream of notification packets indicating changes in process state, we can provide that as a separate interface in future work.
I like Linus' idea of just making waitid(2) (not waitpid(2), as I mistakenly mentioned earlier) on a pidfd act *exactly* like a waitid(2) on the corresponding process and making POLLIN just mean "waitid will succeed". It's a nice simple model that's easy to reason about and that makes it easy to port existing code to pidfds.
I am very much against signaling additional information on basic pidfds using non-POLLIN poll flags.
On Fri, Apr 19, 2019 at 11:48 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:21 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 1:57 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > >> On 04/16, Joel Fernandes wrote: > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > >> > > > > >> > > Could you explain when it should return POLLIN? When the whole > > >process exits? > > >> > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > >or when it > > >> > is in a zombie state and there's no other thread in the thread > > >group. > > >> > > >> IOW, when the whole thread group exits, so it can't be used to > > >monitor sub-threads. > > >> > > >> just in case... speaking of this patch it doesn't modify > > >proc_tid_base_operations, > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > >going to use > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > >proc_tgid_base_operations despite not being a thread group leader. > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > >be hit trivially, and then the code will misbehave. > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > >out if you're dealing with a thread group leader, or make the code > > >work for threads, too. > > > > The latter case probably being preferred if this API is supposed to be > > useable for thread management in userspace. > > At the moment, we are not planning to use this for sub-thread management. I > am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
> discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd.
Sure, but systemd is specialized.
So is Android and we're not designing an interface for Android but for all of userspace. I hope this is clear. Service managers are quite important and systemd is the largest one and they can make good use of this feature.
There are two broad classes of programs that care about process exit status: 1) those that just want to do something and wait for it to complete, and 2) programs that want to perform detailed monitoring of processes and intervention in their state. #1 is overwhelmingly more common. The basic pidfd feature should take care of case #1 only, as wait*() in file descriptor form. I definitely don't think we should be complicating the interface and making it more error-prone (see below) for the sake of that rare program that cares about non-exit notification conditions. You're proposing a complicated combination of poll bit flags that most users (the ones who just wait to wait for processes) don't care about and that risk making the facility hard to use with existing event loops, which generally recognize readability and writability as the only properties that are worth monitoring.
That whole pargraph is about dismissing a range of valid use-cases based on assumptions such as "way more common" and even argues that service managers are special cases and therefore not really worth considering. I would like to be more open to other use cases.
We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea.
Exit status information should be *at least* as broadly available through pidfds as it is through the last field of /proc/pid/stat today, and probably more broadly. I've been saying for six months now that we need to talk about *who* should have access to exit status information. We haven't had that conversation yet. My preference is to just make exit status information globally available, as FreeBSD seems
Totally aside from whether or not this is a good idea but since you keep bringing this up and I'm really curious about this where is this documented and how does this work, please?
On Fri, Apr 19, 2019 at 3:02 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:48 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:21 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 1:57 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote: > On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > > >> On 04/16, Joel Fernandes wrote: > > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > >> > > > > > >> > > Could you explain when it should return POLLIN? When the whole > > > >process exits? > > > >> > > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > > >or when it > > > >> > is in a zombie state and there's no other thread in the thread > > > >group. > > > >> > > > >> IOW, when the whole thread group exits, so it can't be used to > > > >monitor sub-threads. > > > >> > > > >> just in case... speaking of this patch it doesn't modify > > > >proc_tid_base_operations, > > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > > >going to use > > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > > >proc_tgid_base_operations despite not being a thread group leader. > > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > > >be hit trivially, and then the code will misbehave. > > > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > > >out if you're dealing with a thread group leader, or make the code > > > >work for threads, too. > > > > > > The latter case probably being preferred if this API is supposed to be > > > useable for thread management in userspace. > > > > At the moment, we are not planning to use this for sub-thread management. I > > am reworking this patch to only work on clone(2) pidfds which makes the above > > Indeed and agreed. > > > discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > > patches, CLONE_THREAD with pidfd is not supported. > > Yes. We have no one asking for it right now and we can easily add this > later. > > Admittedly I haven't gotten around to reviewing the patches here yet > completely. But one thing about using POLLIN. FreeBSD is using POLLHUP > on process exit which I think is nice as well. How about returning > POLLIN | POLLHUP on process exit? > We already do things like this. For example, when you proxy between > ttys. If the process that you're reading data from has exited and closed > it's end you still can't usually simply exit because it might have still > buffered data that you want to read. The way one can deal with this > from userspace is that you can observe a (POLLHUP | POLLIN) event and > you keep on reading until you only observe a POLLHUP without a POLLIN > event at which point you know you have read > all data. > I like the semantics for pidfds as well as it would indicate: > - POLLHUP -> process has exited > - POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd.
Sure, but systemd is specialized.
So is Android and we're not designing an interface for Android but for all of userspace. I hope this is clear. Service managers are quite important and systemd is the largest one and they can make good use of this feature.
There are two broad classes of programs that care about process exit status: 1) those that just want to do something and wait for it to complete, and 2) programs that want to perform detailed monitoring of processes and intervention in their state. #1 is overwhelmingly more common. The basic pidfd feature should take care of case #1 only, as wait*() in file descriptor form. I definitely don't think we should be complicating the interface and making it more error-prone (see below) for the sake of that rare program that cares about non-exit notification conditions. You're proposing a complicated combination of poll bit flags that most users (the ones who just wait to wait for processes) don't care about and that risk making the facility hard to use with existing event loops, which generally recognize readability and writability as the only properties that are worth monitoring.
That whole pargraph is about dismissing a range of valid use-cases based on assumptions such as "way more common" and even argues that service managers are special cases and therefore not really worth considering. I would like to be more open to other use cases.
We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea.
Exit status information should be *at least* as broadly available through pidfds as it is through the last field of /proc/pid/stat today, and probably more broadly. I've been saying for six months now that we need to talk about *who* should have access to exit status information. We haven't had that conversation yet. My preference is to just make exit status information globally available, as FreeBSD seems
Totally aside from whether or not this is a good idea but since you keep bringing this up and I'm really curious about this where is this documented and how does this work, please?
According to the kqueue FreeBSD man page [1] (I'm reading the FreeBSD 12 version), it's possible to register in a kqueue instead a PID of interest via EVFILT_PROC and receive a NOTE_EXIT notification when that process dies. NOTE_EXIT comes with the exit status of the process that died. I don't see any requirement that EVFILT_PROC work only on child processes of the waiter: on the contrary, the man page states that "if a process can normally see another process, it can attach an event to it.". This documentation reads to me like on FreeBSD process exit status is much more widely available than it is on Linux. Am I missing something?
[1] https://www.freebsd.org/cgi/man.cgi?query=kqueue&apropos=0&sektion=0...
On Sat, Apr 20, 2019 at 12:46 AM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 3:02 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:48 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:21 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 1:57 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote: > > On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote: > > On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > > > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > > > >> On 04/16, Joel Fernandes wrote: > > > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > >> > > > > > > >> > > Could you explain when it should return POLLIN? When the whole > > > > >process exits? > > > > >> > > > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > > > >or when it > > > > >> > is in a zombie state and there's no other thread in the thread > > > > >group. > > > > >> > > > > >> IOW, when the whole thread group exits, so it can't be used to > > > > >monitor sub-threads. > > > > >> > > > > >> just in case... speaking of this patch it doesn't modify > > > > >proc_tid_base_operations, > > > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > > > >going to use > > > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > > > >proc_tgid_base_operations despite not being a thread group leader. > > > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > > > >be hit trivially, and then the code will misbehave. > > > > > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > > > >out if you're dealing with a thread group leader, or make the code > > > > >work for threads, too. > > > > > > > > The latter case probably being preferred if this API is supposed to be > > > > useable for thread management in userspace. > > > > > > At the moment, we are not planning to use this for sub-thread management. I > > > am reworking this patch to only work on clone(2) pidfds which makes the above > > > > Indeed and agreed. > > > > > discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > > > patches, CLONE_THREAD with pidfd is not supported. > > > > Yes. We have no one asking for it right now and we can easily add this > > later. > > > > Admittedly I haven't gotten around to reviewing the patches here yet > > completely. But one thing about using POLLIN. FreeBSD is using POLLHUP > > on process exit which I think is nice as well. How about returning > > POLLIN | POLLHUP on process exit? > > We already do things like this. For example, when you proxy between > > ttys. If the process that you're reading data from has exited and closed > > it's end you still can't usually simply exit because it might have still > > buffered data that you want to read. The way one can deal with this > > from userspace is that you can observe a (POLLHUP | POLLIN) event and > > you keep on reading until you only observe a POLLHUP without a POLLIN > > event at which point you know you have read > > all data. > > I like the semantics for pidfds as well as it would indicate: > > - POLLHUP -> process has exited > > - POLLIN -> information can be read > > Actually I think a bit different about this, in my opinion the pidfd should > always be readable (we would store the exit status somewhere in the future > which would be readable, even after task_struct is dead). So I was thinking > we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd.
Sure, but systemd is specialized.
So is Android and we're not designing an interface for Android but for all of userspace. I hope this is clear. Service managers are quite important and systemd is the largest one and they can make good use of this feature.
There are two broad classes of programs that care about process exit status: 1) those that just want to do something and wait for it to complete, and 2) programs that want to perform detailed monitoring of processes and intervention in their state. #1 is overwhelmingly more common. The basic pidfd feature should take care of case #1 only, as wait*() in file descriptor form. I definitely don't think we should be complicating the interface and making it more error-prone (see below) for the sake of that rare program that cares about non-exit notification conditions. You're proposing a complicated combination of poll bit flags that most users (the ones who just wait to wait for processes) don't care about and that risk making the facility hard to use with existing event loops, which generally recognize readability and writability as the only properties that are worth monitoring.
That whole pargraph is about dismissing a range of valid use-cases based on assumptions such as "way more common" and even argues that service managers are special cases and therefore not really worth considering. I would like to be more open to other use cases.
We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea.
Exit status information should be *at least* as broadly available through pidfds as it is through the last field of /proc/pid/stat today, and probably more broadly. I've been saying for six months now that we need to talk about *who* should have access to exit status information. We haven't had that conversation yet. My preference is to just make exit status information globally available, as FreeBSD seems
Totally aside from whether or not this is a good idea but since you keep bringing this up and I'm really curious about this where is this documented and how does this work, please?
According to the kqueue FreeBSD man page [1] (I'm reading the FreeBSD 12 version), it's possible to register in a kqueue instead a PID of interest via EVFILT_PROC and receive a NOTE_EXIT notification when that process dies. NOTE_EXIT comes with the exit status of the process that died. I don't see any requirement that EVFILT_PROC work only on child processes of the waiter: on the contrary, the man page states that "if a process can normally see another process, it can attach an event to it.". This documentation reads to me like on FreeBSD process exit status is much more widely available than it is on Linux. Am I missing something?
So in fact FreeBSD has what I'm proposing fully for pids but partial for pidfds: state transition montoring NOTE_EXIT, NOTE_FORK, NOTE_EXEC and with NOTE_TRACK even more. For NOTE_EXIT you register a pid or pidfd in an epoll_wait()/kqueue loop you get an event and you can get access to that data in the case of kqueue by look at the "data" member or by getting another event flag. I was putting the idea on the table to do this via EPOLLIN and then looking at a simple struct that contains that information. I like this idea to be honest.
[1] https://www.freebsd.org/cgi/man.cgi?query=kqueue&apropos=0&sektion=0...
On Fri, Apr 19, 2019 at 4:12 PM Christian Brauner christian@brauner.io wrote:
On Sat, Apr 20, 2019 at 12:46 AM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 3:02 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:48 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:21 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 1:57 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote: > > On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote: > > > > On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote: > > > On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > > > > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > > > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > > > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > > > > >> On 04/16, Joel Fernandes wrote: > > > > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > > >> > > > > > > > >> > > Could you explain when it should return POLLIN? When the whole > > > > > >process exits? > > > > > >> > > > > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > > > > >or when it > > > > > >> > is in a zombie state and there's no other thread in the thread > > > > > >group. > > > > > >> > > > > > >> IOW, when the whole thread group exits, so it can't be used to > > > > > >monitor sub-threads. > > > > > >> > > > > > >> just in case... speaking of this patch it doesn't modify > > > > > >proc_tid_base_operations, > > > > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > > > > >going to use > > > > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > > > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > > > > >proc_tgid_base_operations despite not being a thread group leader. > > > > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > > > > >be hit trivially, and then the code will misbehave. > > > > > > > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > > > > >out if you're dealing with a thread group leader, or make the code > > > > > >work for threads, too. > > > > > > > > > > The latter case probably being preferred if this API is supposed to be > > > > > useable for thread management in userspace. > > > > > > > > At the moment, we are not planning to use this for sub-thread management. I > > > > am reworking this patch to only work on clone(2) pidfds which makes the above > > > > > > Indeed and agreed. > > > > > > > discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > > > > patches, CLONE_THREAD with pidfd is not supported. > > > > > > Yes. We have no one asking for it right now and we can easily add this > > > later. > > > > > > Admittedly I haven't gotten around to reviewing the patches here yet > > > completely. But one thing about using POLLIN. FreeBSD is using POLLHUP > > > on process exit which I think is nice as well. How about returning > > > POLLIN | POLLHUP on process exit? > > > We already do things like this. For example, when you proxy between > > > ttys. If the process that you're reading data from has exited and closed > > > it's end you still can't usually simply exit because it might have still > > > buffered data that you want to read. The way one can deal with this > > > from userspace is that you can observe a (POLLHUP | POLLIN) event and > > > you keep on reading until you only observe a POLLHUP without a POLLIN > > > event at which point you know you have read > > > all data. > > > I like the semantics for pidfds as well as it would indicate: > > > - POLLHUP -> process has exited > > > - POLLIN -> information can be read > > > > Actually I think a bit different about this, in my opinion the pidfd should > > always be readable (we would store the exit status somewhere in the future > > which would be readable, even after task_struct is dead). So I was thinking > > we always return EPOLLIN. If process has not exited, then it blocks. > > ITYM that a pidfd polls as readable *once a task exits* and stays > readable forever. Before a task exit, a poll on a pidfd should *not* > yield POLLIN and reading that pidfd should *not* complete immediately. > There's no way that, having observed POLLIN on a pidfd, you should > ever then *not* see POLLIN on that pidfd in the future --- it's a > one-way transition from not-ready-to-get-exit-status to > ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd.
Sure, but systemd is specialized.
So is Android and we're not designing an interface for Android but for all of userspace. I hope this is clear. Service managers are quite important and systemd is the largest one and they can make good use of this feature.
There are two broad classes of programs that care about process exit status: 1) those that just want to do something and wait for it to complete, and 2) programs that want to perform detailed monitoring of processes and intervention in their state. #1 is overwhelmingly more common. The basic pidfd feature should take care of case #1 only, as wait*() in file descriptor form. I definitely don't think we should be complicating the interface and making it more error-prone (see below) for the sake of that rare program that cares about non-exit notification conditions. You're proposing a complicated combination of poll bit flags that most users (the ones who just wait to wait for processes) don't care about and that risk making the facility hard to use with existing event loops, which generally recognize readability and writability as the only properties that are worth monitoring.
That whole pargraph is about dismissing a range of valid use-cases based on assumptions such as "way more common" and even argues that service managers are special cases and therefore not really worth considering. I would like to be more open to other use cases.
We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea.
Exit status information should be *at least* as broadly available through pidfds as it is through the last field of /proc/pid/stat today, and probably more broadly. I've been saying for six months now that we need to talk about *who* should have access to exit status information. We haven't had that conversation yet. My preference is to just make exit status information globally available, as FreeBSD seems
Totally aside from whether or not this is a good idea but since you keep bringing this up and I'm really curious about this where is this documented and how does this work, please?
According to the kqueue FreeBSD man page [1] (I'm reading the FreeBSD 12 version), it's possible to register in a kqueue instead a PID of interest via EVFILT_PROC and receive a NOTE_EXIT notification when that process dies. NOTE_EXIT comes with the exit status of the process that died. I don't see any requirement that EVFILT_PROC work only on child processes of the waiter: on the contrary, the man page states that "if a process can normally see another process, it can attach an event to it.". This documentation reads to me like on FreeBSD process exit status is much more widely available than it is on Linux. Am I missing something?
So in fact FreeBSD has what I'm proposing fully for pids but partial for pidfds: state transition montoring NOTE_EXIT, NOTE_FORK, NOTE_EXEC and with NOTE_TRACK even more. For NOTE_EXIT you register a pid or pidfd in an epoll_wait()/kqueue loop you get an event and you can get access to that data in the case of kqueue by look at the "data" member or by getting another event flag. I was putting the idea on the table to do this via EPOLLIN and then looking at a simple struct that contains that information.
If you turn pidfd into an event stream, reads have to be destructive. If reads are destructive, you can't share pidfds instances between multiple readers. If you can't get a pidfd except via clone, you can't have more than one pidfd instance for a single process. The overall result is that we're back in the same place we were before with the old wait system, i.e., only one entity can monitor a process for interesting state transitions and everyone else gets a racy, inadequate interface via /proc. FreeBSD doesn't have this problem because you can create an *arbitrary* number of *different* kqueue objects, register a PID in each of them, and get an independent destructively-read event stream in each context. It's worth noting that the FreeBSD process file descriptor from pdfork(2) is *NOT* an event stream, as you're describing, but a level-triggered one-transition facility of the sort that I'm advocating.
In other words, FreeBSD already implements the model I'm describing: level-triggered simple exit notification for pidfd and a separate edge-triggered monitoring facility.
I like this idea to be honest.
I'm not opposed to some facility that delivers a stream of events relating to some process. That could even be epoll, as our rough equivalent to kqueue. I don't see a need to make the pidfd the channel through which we deliver these events. There's room for both an event stream like the one FreeBSD provides and a level-triggered "did this process exit or not?" indication via pidfd.
On Sat, Apr 20, 2019 at 1:47 AM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 4:12 PM Christian Brauner christian@brauner.io wrote:
On Sat, Apr 20, 2019 at 12:46 AM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 3:02 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:48 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:21 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 1:57 PM Christian Brauner christian@brauner.io wrote: > > On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote: > > > > On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote: > > > > > > On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote: > > > > On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > > > > > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > > > > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > > > > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > > > > > >> On 04/16, Joel Fernandes wrote: > > > > > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > > > >> > > > > > > > > >> > > Could you explain when it should return POLLIN? When the whole > > > > > > >process exits? > > > > > > >> > > > > > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > > > > > >or when it > > > > > > >> > is in a zombie state and there's no other thread in the thread > > > > > > >group. > > > > > > >> > > > > > > >> IOW, when the whole thread group exits, so it can't be used to > > > > > > >monitor sub-threads. > > > > > > >> > > > > > > >> just in case... speaking of this patch it doesn't modify > > > > > > >proc_tid_base_operations, > > > > > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > > > > > >going to use > > > > > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > > > > > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > > > > > >proc_tgid_base_operations despite not being a thread group leader. > > > > > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > > > > > >be hit trivially, and then the code will misbehave. > > > > > > > > > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > > > > > >out if you're dealing with a thread group leader, or make the code > > > > > > >work for threads, too. > > > > > > > > > > > > The latter case probably being preferred if this API is supposed to be > > > > > > useable for thread management in userspace. > > > > > > > > > > At the moment, we are not planning to use this for sub-thread management. I > > > > > am reworking this patch to only work on clone(2) pidfds which makes the above > > > > > > > > Indeed and agreed. > > > > > > > > > discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > > > > > patches, CLONE_THREAD with pidfd is not supported. > > > > > > > > Yes. We have no one asking for it right now and we can easily add this > > > > later. > > > > > > > > Admittedly I haven't gotten around to reviewing the patches here yet > > > > completely. But one thing about using POLLIN. FreeBSD is using POLLHUP > > > > on process exit which I think is nice as well. How about returning > > > > POLLIN | POLLHUP on process exit? > > > > We already do things like this. For example, when you proxy between > > > > ttys. If the process that you're reading data from has exited and closed > > > > it's end you still can't usually simply exit because it might have still > > > > buffered data that you want to read. The way one can deal with this > > > > from userspace is that you can observe a (POLLHUP | POLLIN) event and > > > > you keep on reading until you only observe a POLLHUP without a POLLIN > > > > event at which point you know you have read > > > > all data. > > > > I like the semantics for pidfds as well as it would indicate: > > > > - POLLHUP -> process has exited > > > > - POLLIN -> information can be read > > > > > > Actually I think a bit different about this, in my opinion the pidfd should > > > always be readable (we would store the exit status somewhere in the future > > > which would be readable, even after task_struct is dead). So I was thinking > > > we always return EPOLLIN. If process has not exited, then it blocks. > > > > ITYM that a pidfd polls as readable *once a task exits* and stays > > readable forever. Before a task exit, a poll on a pidfd should *not* > > yield POLLIN and reading that pidfd should *not* complete immediately. > > There's no way that, having observed POLLIN on a pidfd, you should > > ever then *not* see POLLIN on that pidfd in the future --- it's a > > one-way transition from not-ready-to-get-exit-status to > > ready-to-get-exit-status. > > What do you consider interesting state transitions? A listener on a pidfd > in epoll_wait() might be interested if the process execs for example. > That's a very valid use-case for e.g. systemd.
Sure, but systemd is specialized.
So is Android and we're not designing an interface for Android but for all of userspace. I hope this is clear. Service managers are quite important and systemd is the largest one and they can make good use of this feature.
There are two broad classes of programs that care about process exit status: 1) those that just want to do something and wait for it to complete, and 2) programs that want to perform detailed monitoring of processes and intervention in their state. #1 is overwhelmingly more common. The basic pidfd feature should take care of case #1 only, as wait*() in file descriptor form. I definitely don't think we should be complicating the interface and making it more error-prone (see below) for the sake of that rare program that cares about non-exit notification conditions. You're proposing a complicated combination of poll bit flags that most users (the ones who just wait to wait for processes) don't care about and that risk making the facility hard to use with existing event loops, which generally recognize readability and writability as the only properties that are worth monitoring.
That whole pargraph is about dismissing a range of valid use-cases based on assumptions such as "way more common" and even argues that service managers are special cases and therefore not really worth considering. I would like to be more open to other use cases.
> We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) > to check whether an exit status can be read which is not nice and then you > multiplex different meanings on the same bit. > I would prefer if the exit status can only be read from the parent which is > clean and the least complicated semantics, i.e. Linus waitid() idea.
Exit status information should be *at least* as broadly available through pidfds as it is through the last field of /proc/pid/stat today, and probably more broadly. I've been saying for six months now that we need to talk about *who* should have access to exit status information. We haven't had that conversation yet. My preference is to just make exit status information globally available, as FreeBSD seems
Totally aside from whether or not this is a good idea but since you keep bringing this up and I'm really curious about this where is this documented and how does this work, please?
According to the kqueue FreeBSD man page [1] (I'm reading the FreeBSD 12 version), it's possible to register in a kqueue instead a PID of interest via EVFILT_PROC and receive a NOTE_EXIT notification when that process dies. NOTE_EXIT comes with the exit status of the process that died. I don't see any requirement that EVFILT_PROC work only on child processes of the waiter: on the contrary, the man page states that "if a process can normally see another process, it can attach an event to it.". This documentation reads to me like on FreeBSD process exit status is much more widely available than it is on Linux. Am I missing something?
So in fact FreeBSD has what I'm proposing fully for pids but partial for pidfds: state transition montoring NOTE_EXIT, NOTE_FORK, NOTE_EXEC and with NOTE_TRACK even more. For NOTE_EXIT you register a pid or pidfd in an epoll_wait()/kqueue loop you get an event and you can get access to that data in the case of kqueue by look at the "data" member or by getting another event flag. I was putting the idea on the table to do this via EPOLLIN and then looking at a simple struct that contains that information.
If you turn pidfd into an event stream, reads have to be destructive. If reads are destructive, you can't share pidfds instances between multiple readers. If you can't get a pidfd except via clone, you can't have more than one pidfd instance for a single process. The overall
It's not off the table that we can add a pidfd_open() if that becomes a real thing.
result is that we're back in the same place we were before with the old wait system, i.e., only one entity can monitor a process for interesting state transitions and everyone else gets a racy, inadequate interface via /proc. FreeBSD doesn't have this problem because you can create an *arbitrary* number of *different* kqueue objects, register a PID in each of them, and get an independent destructively-read event stream in each context. It's worth noting
If we add pidfd_open() we should be able to do this too though.
that the FreeBSD process file descriptor from pdfork(2) is *NOT* an event stream, as you're describing, but a level-triggered
Because it only implements the exit event, that's what I said before but you can do it with pids already. And they thought about this use-case for pidfds at least. My point is that we don't end up with an interface that then doesn't allow us to extend this to cover such an api.
one-transition facility of the sort that I'm advocating.
In other words, FreeBSD already implements the model I'm describing: level-triggered simple exit notification for pidfd and a separate edge-triggered monitoring facility.
I like this idea to be honest.
I'm not opposed to some facility that delivers a stream of events relating to some process. That could even be epoll, as our rough equivalent to kqueue. I don't see a need to make the pidfd the channel through which we deliver these events. There's room for both an event
The problem is that epoll_wait() does currently not return data that the user hasn't already passed in via the "data" argument ad ADD or MOD time. And it seems nice to deliver this through the pidfd itself.
stream like the one FreeBSD provides and a level-triggered "did this process exit or not?" indication via pidfd.
On 20.04.19 02:17, Christian Brauner wrote:
It's not off the table that we can add a pidfd_open() if that becomes> a real thing.
What are the exact semantics of pidfd_open() ?
--mtx
On 20.04.19 01:46, Daniel Colascione wrote:
If you turn pidfd into an event stream, reads have to be destructive. If reads are destructive, you can't share pidfds instances between multiple readers.
Can't we find a way to duplicate them ? Someting like dup(), but with actually duplicating the open file ? (maybe add a flag for that to dup3 ?)
--mtx
On Fri, Apr 19, 2019 at 2:48 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:21 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 1:57 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote:
On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > >> On 04/16, Joel Fernandes wrote: > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > >> > > > > >> > > Could you explain when it should return POLLIN? When the whole > > >process exits? > > >> > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > >or when it > > >> > is in a zombie state and there's no other thread in the thread > > >group. > > >> > > >> IOW, when the whole thread group exits, so it can't be used to > > >monitor sub-threads. > > >> > > >> just in case... speaking of this patch it doesn't modify > > >proc_tid_base_operations, > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > >going to use > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > >proc_tgid_base_operations despite not being a thread group leader. > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > >be hit trivially, and then the code will misbehave. > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > >out if you're dealing with a thread group leader, or make the code > > >work for threads, too. > > > > The latter case probably being preferred if this API is supposed to be > > useable for thread management in userspace. > > At the moment, we are not planning to use this for sub-thread management. I > am reworking this patch to only work on clone(2) pidfds which makes the above
Indeed and agreed.
> discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > patches, CLONE_THREAD with pidfd is not supported.
Yes. We have no one asking for it right now and we can easily add this later.
Admittedly I haven't gotten around to reviewing the patches here yet completely. But one thing about using POLLIN. FreeBSD is using POLLHUP on process exit which I think is nice as well. How about returning POLLIN | POLLHUP on process exit? We already do things like this. For example, when you proxy between ttys. If the process that you're reading data from has exited and closed it's end you still can't usually simply exit because it might have still buffered data that you want to read. The way one can deal with this from userspace is that you can observe a (POLLHUP | POLLIN) event and you keep on reading until you only observe a POLLHUP without a POLLIN event at which point you know you have read all data. I like the semantics for pidfds as well as it would indicate:
- POLLHUP -> process has exited
- POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd.
Sure, but systemd is specialized.
So is Android and we're not designing an interface for Android but for all of userspace.
Nothing in my post is Android-specific. Waiting for non-child processes is something that lots of people want to do, which is why patches to enable it have been getting posted every few years for many years (e.g., Andy's from 2011). I, too, want to make an API for all over userspace. Don't attribute to me arguments that I'm not actually making.
I hope this is clear. Service managers are quite important and systemd is the largest one and they can make good use of this feature.
Service managers already have the tools they need to do their job. The kind of monitoring you're talking about is a niche case and an improved API for this niche --- which amounts to a rethought ptrace --- can wait for a future date, when it can be done right. Nothing in the model I'm advocating precludes adding an event stream API in the future. I don't think we should gate the ability to wait for process exit via pidfd on pidfds providing an entire ptrace replacement facility.
There are two broad classes of programs that care about process exit status: 1) those that just want to do something and wait for it to complete, and 2) programs that want to perform detailed monitoring of processes and intervention in their state. #1 is overwhelmingly more common. The basic pidfd feature should take care of case #1 only, as wait*() in file descriptor form. I definitely don't think we should be complicating the interface and making it more error-prone (see below) for the sake of that rare program that cares about non-exit notification conditions. You're proposing a complicated combination of poll bit flags that most users (the ones who just wait to wait for processes) don't care about and that risk making the facility hard to use with existing event loops, which generally recognize readability and writability as the only properties that are worth monitoring.
That whole pargraph is about dismissing a range of valid use-cases based on assumptions such as "way more common" and
It really ought not to be controversial to say that process managers make up a small fraction of the programs that wait for child processes.
even argues that service managers are special cases and therefore not really worth considering. I would like to be more open to other use cases.
It's not my position that service managers are "not worth considering" and you know that, so I'd appreciate your not attributing to me views hat I don't hold. I *am* saying that an event-based process-monitoring API is out of scope and that it should be separate work: the overwhelmingly majority of process manipulation (say, in libraries wanting private helper processes, which is something I thought we all agreed would be beneficial to support) is waiting for exit.
We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea.
Exit status information should be *at least* as broadly available through pidfds as it is through the last field of /proc/pid/stat today, and probably more broadly. I've been saying for six months now that we need to talk about *who* should have access to exit status information. We haven't had that conversation yet. My preference is to
just make exit status information globally available, as FreeBSD seems to do. I think it would be broadly useful for something like pkill to
From the pdfork() FreeBSD manpage: "poll(2) and select(2) allow waiting for process state transitions; currently only POLLHUP is defined, and will be raised when the process dies. Process state transitions can also be monitored using kqueue(2) filter EVFILT_PROCDESC; currently only NOTE_EXIT is implemented."
I don't understand what you're trying to demonstrate by quoting that passage.
wait for processes to exit and to retrieve their exit information.
Speaking of pkill: AIUI, in your current patch set, one can get a pidfd *only* via clone. Joel indicated that he believes poll(2) shouldn't be supported on procfs pidfds. Is that your thinking as well? If that's the case, then we're in a state where non-parents
Yes, it is.
If reading process status information from a pidfd is destructive, it's dangerous to share pidfds between processes. If reading information *isn't* destructive, how are you supposed to use poll(2) to wait for the next transition? Is poll destructive? If you can only make a new pidfd via clone, you can't get two separate event streams for two different users. Sharing a single pidfd via dup or SCM_RIGHTS becomes dangerous, because if reading status is destructive, only one reader can observe each event. Your proposed edge-triggered design makes pidfds significantly less useful, because in your design, it's unsafe to share a single pidfd open file description *and* there's no way to create a new pidfd open file description for an existing process.
I think we should make an API for all of userspace and not just for container managers and systemd.
can't wait for process exit, and providing this facility is an important goal of the whole project.
That's your goal.
I thought we all agreed on that months ago that it's reasonable to allow processes to wait for non-child processes to exit. Now, out of the blue, you're saying that 1) actually, we want a rich API for all kinds of things that aren't process exit, because systemd, and 2) actually, non-parents shouldn't be able to wait for process death. I don't know what to say. Can you point to something that might have changed your mind? I'd appreciate other people weighing in on this subject.
On Sat, Apr 20, 2019 at 12:35 AM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 2:48 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:21 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 1:57 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote:
On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote: > On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > > >> On 04/16, Joel Fernandes wrote: > > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > >> > > > > > >> > > Could you explain when it should return POLLIN? When the whole > > > >process exits? > > > >> > > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > > >or when it > > > >> > is in a zombie state and there's no other thread in the thread > > > >group. > > > >> > > > >> IOW, when the whole thread group exits, so it can't be used to > > > >monitor sub-threads. > > > >> > > > >> just in case... speaking of this patch it doesn't modify > > > >proc_tid_base_operations, > > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > > >going to use > > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > > >proc_tgid_base_operations despite not being a thread group leader. > > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > > >be hit trivially, and then the code will misbehave. > > > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > > >out if you're dealing with a thread group leader, or make the code > > > >work for threads, too. > > > > > > The latter case probably being preferred if this API is supposed to be > > > useable for thread management in userspace. > > > > At the moment, we are not planning to use this for sub-thread management. I > > am reworking this patch to only work on clone(2) pidfds which makes the above > > Indeed and agreed. > > > discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > > patches, CLONE_THREAD with pidfd is not supported. > > Yes. We have no one asking for it right now and we can easily add this > later. > > Admittedly I haven't gotten around to reviewing the patches here yet > completely. But one thing about using POLLIN. FreeBSD is using POLLHUP > on process exit which I think is nice as well. How about returning > POLLIN | POLLHUP on process exit? > We already do things like this. For example, when you proxy between > ttys. If the process that you're reading data from has exited and closed > it's end you still can't usually simply exit because it might have still > buffered data that you want to read. The way one can deal with this > from userspace is that you can observe a (POLLHUP | POLLIN) event and > you keep on reading until you only observe a POLLHUP without a POLLIN > event at which point you know you have read > all data. > I like the semantics for pidfds as well as it would indicate: > - POLLHUP -> process has exited > - POLLIN -> information can be read
Actually I think a bit different about this, in my opinion the pidfd should always be readable (we would store the exit status somewhere in the future which would be readable, even after task_struct is dead). So I was thinking we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd.
Sure, but systemd is specialized.
So is Android and we're not designing an interface for Android but for all of userspace.
Nothing in my post is Android-specific. Waiting for non-child processes is something that lots of people want to do, which is why patches to enable it have been getting posted every few years for many years (e.g., Andy's from 2011). I, too, want to make an API for all over userspace. Don't attribute to me arguments that I'm not actually making.
I hope this is clear. Service managers are quite important and systemd is the largest one and they can make good use of this feature.
Service managers already have the tools they need to do their job. The
No they don't. Even if they quite often have kludges and run into a lot of problems. That's why there's interest in these features as well.
kind of monitoring you're talking about is a niche case and an improved API for this niche --- which amounts to a rethought ptrace --- can wait for a future date, when it can be done right. Nothing in the model I'm advocating precludes adding an event stream API in the future. I don't think we should gate the ability to wait for process exit via pidfd on pidfds providing an entire ptrace replacement facility.
There are two broad classes of programs that care about process exit status: 1) those that just want to do something and wait for it to complete, and 2) programs that want to perform detailed monitoring of processes and intervention in their state. #1 is overwhelmingly more common. The basic pidfd feature should take care of case #1 only, as wait*() in file descriptor form. I definitely don't think we should be complicating the interface and making it more error-prone (see below) for the sake of that rare program that cares about non-exit notification conditions. You're proposing a complicated combination of poll bit flags that most users (the ones who just wait to wait for processes) don't care about and that risk making the facility hard to use with existing event loops, which generally recognize readability and writability as the only properties that are worth monitoring.
That whole pargraph is about dismissing a range of valid use-cases based on assumptions such as "way more common" and
It really ought not to be controversial to say that process managers make up a small fraction of the programs that wait for child processes.
Well, daemons tend to do those things do. System managers and container managers are just an example of a whole class. Even if you just consider system managers like openrc, systemd you have gotten yourself quite a large userbase.
even argues that service managers are special cases and therefore not really worth considering. I would like to be more open to other use cases.
It's not my position that service managers are "not worth considering" and you know that, so I'd appreciate your not attributing to me views hat I don't hold. I *am* saying that an event-based process-monitoring
It very much sounded like it. Calling them a "niche" case didn't help given that they run quite a lot of workloads everywhere.
API is out of scope and that it should be separate work: the overwhelmingly majority of process manipulation (say, in libraries wanting private helper processes, which is something I thought we all agreed would be beneficial to support) is waiting for exit.
We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea.
Exit status information should be *at least* as broadly available through pidfds as it is through the last field of /proc/pid/stat today, and probably more broadly. I've been saying for six months now that we need to talk about *who* should have access to exit status information. We haven't had that conversation yet. My preference is to
just make exit status information globally available, as FreeBSD seems to do. I think it would be broadly useful for something like pkill to
From the pdfork() FreeBSD manpage: "poll(2) and select(2) allow waiting for process state transitions; currently only POLLHUP is defined, and will be raised when the process dies. Process state transitions can also be monitored using kqueue(2) filter EVFILT_PROCDESC; currently only NOTE_EXIT is implemented."
I don't understand what you're trying to demonstrate by quoting that passage.
FreeBSD obviously has thought about being able to observe more than just NOTE_EXIT in the future.
wait for processes to exit and to retrieve their exit information.
Speaking of pkill: AIUI, in your current patch set, one can get a pidfd *only* via clone. Joel indicated that he believes poll(2) shouldn't be supported on procfs pidfds. Is that your thinking as well? If that's the case, then we're in a state where non-parents
Yes, it is.
If reading process status information from a pidfd is destructive, it's dangerous to share pidfds between processes. If reading information *isn't* destructive, how are you supposed to use poll(2) to wait for the next transition? Is poll destructive? If you can only make a new pidfd via clone, you can't get two separate event streams for two different users. Sharing a single pidfd via dup or SCM_RIGHTS becomes dangerous, because if reading status is destructive, only one reader can observe each event. Your proposed edge-triggered design makes pidfds significantly less useful, because in your design, it's unsafe to share a single pidfd open file description *and* there's no way to create a new pidfd open file description for an existing process.
I think we should make an API for all of userspace and not just for container managers and systemd.
I mean, you can go and try making arguments based on syntactical rearrangements of things I said but I'm going to pass.
My point simply was: There are more users that would be interested in observing more state transitions in the future. Your argument made it sound like they are not worth considering. I disagree.
can't wait for process exit, and providing this facility is an important goal of the whole project.
That's your goal.
I thought we all agreed on that months ago that it's reasonable to allow processes to wait for non-child processes to exit. Now, out of
Uhm, I can't remember being privy to that agreement but the threads get so long that maybe I forgot what I wrote?
the blue, you're saying that 1) actually, we want a rich API for all kinds of things that aren't process exit, because systemd, and 2)
- I'm not saying we have to. It just makes it more flexible and is something we can at least consider. - systemd is an example of another *huge* user of this api. That neither implies this api is "because systemd" it simply makes it worth that we consider this use-case.
actually, non-parents shouldn't be able to wait for process death. I
I'm sorry, who has agreed that a non-parent should be able to wait for process death? I know you proposed that but has anyone ever substantially supported this? I'm happy if you can gather the necessary support for this but I just haven't seen that yet.
On Fri, Apr 19, 2019 at 4:02 PM Christian Brauner christian@brauner.io wrote:
On Sat, Apr 20, 2019 at 12:35 AM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 2:48 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:21 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 1:57 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote: > > On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote: > > On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > > > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > > > >> On 04/16, Joel Fernandes wrote: > > > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > >> > > > > > > >> > > Could you explain when it should return POLLIN? When the whole > > > > >process exits? > > > > >> > > > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > > > >or when it > > > > >> > is in a zombie state and there's no other thread in the thread > > > > >group. > > > > >> > > > > >> IOW, when the whole thread group exits, so it can't be used to > > > > >monitor sub-threads. > > > > >> > > > > >> just in case... speaking of this patch it doesn't modify > > > > >proc_tid_base_operations, > > > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > > > >going to use > > > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > > > >proc_tgid_base_operations despite not being a thread group leader. > > > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > > > >be hit trivially, and then the code will misbehave. > > > > > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > > > >out if you're dealing with a thread group leader, or make the code > > > > >work for threads, too. > > > > > > > > The latter case probably being preferred if this API is supposed to be > > > > useable for thread management in userspace. > > > > > > At the moment, we are not planning to use this for sub-thread management. I > > > am reworking this patch to only work on clone(2) pidfds which makes the above > > > > Indeed and agreed. > > > > > discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > > > patches, CLONE_THREAD with pidfd is not supported. > > > > Yes. We have no one asking for it right now and we can easily add this > > later. > > > > Admittedly I haven't gotten around to reviewing the patches here yet > > completely. But one thing about using POLLIN. FreeBSD is using POLLHUP > > on process exit which I think is nice as well. How about returning > > POLLIN | POLLHUP on process exit? > > We already do things like this. For example, when you proxy between > > ttys. If the process that you're reading data from has exited and closed > > it's end you still can't usually simply exit because it might have still > > buffered data that you want to read. The way one can deal with this > > from userspace is that you can observe a (POLLHUP | POLLIN) event and > > you keep on reading until you only observe a POLLHUP without a POLLIN > > event at which point you know you have read > > all data. > > I like the semantics for pidfds as well as it would indicate: > > - POLLHUP -> process has exited > > - POLLIN -> information can be read > > Actually I think a bit different about this, in my opinion the pidfd should > always be readable (we would store the exit status somewhere in the future > which would be readable, even after task_struct is dead). So I was thinking > we always return EPOLLIN. If process has not exited, then it blocks.
ITYM that a pidfd polls as readable *once a task exits* and stays readable forever. Before a task exit, a poll on a pidfd should *not* yield POLLIN and reading that pidfd should *not* complete immediately. There's no way that, having observed POLLIN on a pidfd, you should ever then *not* see POLLIN on that pidfd in the future --- it's a one-way transition from not-ready-to-get-exit-status to ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd.
Sure, but systemd is specialized.
So is Android and we're not designing an interface for Android but for all of userspace.
Nothing in my post is Android-specific. Waiting for non-child processes is something that lots of people want to do, which is why patches to enable it have been getting posted every few years for many years (e.g., Andy's from 2011). I, too, want to make an API for all over userspace. Don't attribute to me arguments that I'm not actually making.
I hope this is clear. Service managers are quite important and systemd is the largest one and they can make good use of this feature.
Service managers already have the tools they need to do their job. The
No they don't. Even if they quite often have kludges and run into a lot of problems. That's why there's interest in these features as well.
Yes, and these facilities should have a powerful toolkit that they can use to do their job in the right way. This toolkit will probably involve pretty powerful kinds of process monitoring. I don't see a reason to gate the ability to wait for process death via pidfd on that toolkit. Please don't interpret my position as saying that the service monitor usecase is unimportant or not worth adding to Linux.
kind of monitoring you're talking about is a niche case and an improved API for this niche --- which amounts to a rethought ptrace --- can wait for a future date, when it can be done right. Nothing in the model I'm advocating precludes adding an event stream API in the future. I don't think we should gate the ability to wait for process exit via pidfd on pidfds providing an entire ptrace replacement facility.
There are two broad classes of programs that care about process exit status: 1) those that just want to do something and wait for it to complete, and 2) programs that want to perform detailed monitoring of processes and intervention in their state. #1 is overwhelmingly more common. The basic pidfd feature should take care of case #1 only, as wait*() in file descriptor form. I definitely don't think we should be complicating the interface and making it more error-prone (see below) for the sake of that rare program that cares about non-exit notification conditions. You're proposing a complicated combination of poll bit flags that most users (the ones who just wait to wait for processes) don't care about and that risk making the facility hard to use with existing event loops, which generally recognize readability and writability as the only properties that are worth monitoring.
That whole pargraph is about dismissing a range of valid use-cases based on assumptions such as "way more common" and
It really ought not to be controversial to say that process managers make up a small fraction of the programs that wait for child processes.
Well, daemons tend to do those things do. System managers and container managers are just an example of a whole class. Even if you just consider system managers like openrc, systemd you have gotten yourself quite a large userbase.
When I said "niche", I didn't mean "unimportant". I meant "specialized", as in the people who write these sorts of programs are willing to dive in to low-level operational details and get things right. I also think the community of people *writing* programs like systemd is relatively small.
even argues that service managers are special cases and therefore not really worth considering. I would like to be more open to other use cases.
It's not my position that service managers are "not worth considering" and you know that, so I'd appreciate your not attributing to me views hat I don't hold. I *am* saying that an event-based process-monitoring
It very much sounded like it. Calling them a "niche" case didn't help given that they run quite a lot of workloads everywhere.
API is out of scope and that it should be separate work: the overwhelmingly majority of process manipulation (say, in libraries wanting private helper processes, which is something I thought we all agreed would be beneficial to support) is waiting for exit.
We can't use EPOLLIN for that too otherwise you'd need to to waitid(_WNOHANG) to check whether an exit status can be read which is not nice and then you multiplex different meanings on the same bit. I would prefer if the exit status can only be read from the parent which is clean and the least complicated semantics, i.e. Linus waitid() idea.
Exit status information should be *at least* as broadly available through pidfds as it is through the last field of /proc/pid/stat today, and probably more broadly. I've been saying for six months now that we need to talk about *who* should have access to exit status information. We haven't had that conversation yet. My preference is to
just make exit status information globally available, as FreeBSD seems to do. I think it would be broadly useful for something like pkill to
From the pdfork() FreeBSD manpage: "poll(2) and select(2) allow waiting for process state transitions; currently only POLLHUP is defined, and will be raised when the process dies. Process state transitions can also be monitored using kqueue(2) filter EVFILT_PROCDESC; currently only NOTE_EXIT is implemented."
I don't understand what you're trying to demonstrate by quoting that passage.
FreeBSD obviously has thought about being able to observe more than just NOTE_EXIT in the future.
Yes. Did I say anywhere that we should never be able to observe execs and forks? I think that what FreeBSD got *right* is making process exit status broadly available. What I think they got wrong is the mixing of exit information with other EVFILT_PROCDESC messages.
wait for processes to exit and to retrieve their exit information.
Speaking of pkill: AIUI, in your current patch set, one can get a pidfd *only* via clone. Joel indicated that he believes poll(2) shouldn't be supported on procfs pidfds. Is that your thinking as well? If that's the case, then we're in a state where non-parents
Yes, it is.
If reading process status information from a pidfd is destructive, it's dangerous to share pidfds between processes. If reading information *isn't* destructive, how are you supposed to use poll(2) to wait for the next transition? Is poll destructive? If you can only make a new pidfd via clone, you can't get two separate event streams for two different users. Sharing a single pidfd via dup or SCM_RIGHTS becomes dangerous, because if reading status is destructive, only one reader can observe each event. Your proposed edge-triggered design makes pidfds significantly less useful, because in your design, it's unsafe to share a single pidfd open file description *and* there's no way to create a new pidfd open file description for an existing process.
I think we should make an API for all of userspace and not just for container managers and systemd.
I mean, you can go and try making arguments based on syntactical rearrangements of things I said but I'm going to pass.
I'd prefer if we focused on substantive technical issues instead of unproductive comments on syntax. I'd prefer to talk about the technical concerns I raised regarding an edge-triggered event-queue design making pidfd file descriptor sharing dangerous.
My point simply was: There are more users that would be interested in observing more state transitions in the future. Your argument made it sound like they are not worth considering. I disagree.
I believe you've misunderstood me. I've never said that this use case is "not worth considering". I would appreciate it if you wouldn't claim that I believe these use cases aren't worth considering. My point is that these uses would be better served through a dedicated process event monitoring facility, one that could replace ptrace. I would be thrilled by something like that. The point I'm making, to be very clear, is *NOT* that process monitoring is "not worth considering", but that process monitoring is subtle and complicated enough that it ought to be considered as a standalone project, independent of pidfds proper and of the very simple and effective pidfd system that Joel has proposed in his patch series.
can't wait for process exit, and providing this facility is an important goal of the whole project.
That's your goal.
I thought we all agreed on that months ago that it's reasonable to allow processes to wait for non-child processes to exit. Now, out of
Uhm, I can't remember being privy to that agreement but the threads get so long that maybe I forgot what I wrote?
the blue, you're saying that 1) actually, we want a rich API for all kinds of things that aren't process exit, because systemd, and 2)
- I'm not saying we have to. It just makes it more flexible and is something
we can at least consider.
I've spent a few emails and a lot of thought considering the idea. If I weren't considering it, I wouldn't have thought through the implications of destructive event reads above. My position, *after due consideration*, is that we're better off making the pidfd poll() system a level-triggered exit-only signal and defer more sophisticated monitoring to a separate facility. I'm not saying that we shouldn't be able to monitor processes. I'm saying that there are important technical and API design reasons why the initial version of the pidfd wait system should be simple and support only one thing. The model you've proposed has important technical disadvantages, and you haven't addressed these technical disadvantages.
- systemd is an example of another *huge* user of this api. That neither implies
this api is "because systemd" it simply makes it worth that we consider this use-case.
actually, non-parents shouldn't be able to wait for process death. I
I'm sorry, who has agreed that a non-parent should be able to wait for process death?
Aleksa Sarai, for starters. See [1]. In any case, I don't see where your sudden opposition to the idea is coming from. I've been saying over and over that it's important that we allow processes to wait for the death of non-children. Why are you just objecting now? And on what basis are you objecting? Why *shouldn't* we be able to wait for process death in a general way? Both FreeBSD and Windows can do it.
I know you proposed that but has anyone ever substantially supported this? I'm happy if you can gather the necessary support for this but I just haven't seen that yet.
I recommend re-reading the whole exithand thread, since we covered a lot of the ground that we're also covering here. In any case, there's a legitimate technical reason why we'd want to wait for non-child death, and I would appreciate it if you didn't just summarily dismiss it as "[my] goal". Let's talk about what's best for users, not use that kind of unproductive rhetoric.
[1] https://lore.kernel.org/lkml/20181101070036.l24c2p432ohuwmqf@yavin/
On Sat, Apr 20, 2019 at 1:30 AM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 4:02 PM Christian Brauner christian@brauner.io wrote:
On Sat, Apr 20, 2019 at 12:35 AM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 2:48 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 11:21 PM Daniel Colascione dancol@google.com wrote:
On Fri, Apr 19, 2019 at 1:57 PM Christian Brauner christian@brauner.io wrote:
On Fri, Apr 19, 2019 at 10:34 PM Daniel Colascione dancol@google.com wrote: > > On Fri, Apr 19, 2019 at 12:49 PM Joel Fernandes joel@joelfernandes.org wrote: > > > > On Fri, Apr 19, 2019 at 09:18:59PM +0200, Christian Brauner wrote: > > > On Fri, Apr 19, 2019 at 03:02:47PM -0400, Joel Fernandes wrote: > > > > On Thu, Apr 18, 2019 at 07:26:44PM +0200, Christian Brauner wrote: > > > > > On April 18, 2019 7:23:38 PM GMT+02:00, Jann Horn jannh@google.com wrote: > > > > > >On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote: > > > > > >> On 04/16, Joel Fernandes wrote: > > > > > >> > On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote: > > > > > >> > > > > > > > >> > > Could you explain when it should return POLLIN? When the whole > > > > > >process exits? > > > > > >> > > > > > > >> > It returns POLLIN when the task is dead or doesn't exist anymore, > > > > > >or when it > > > > > >> > is in a zombie state and there's no other thread in the thread > > > > > >group. > > > > > >> > > > > > >> IOW, when the whole thread group exits, so it can't be used to > > > > > >monitor sub-threads. > > > > > >> > > > > > >> just in case... speaking of this patch it doesn't modify > > > > > >proc_tid_base_operations, > > > > > >> so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are > > > > > >going to use > > > > > >> the anonymous file returned by CLONE_PIDFD ? > > > > > > > > > > > >I don't think procfs works that way. /proc/sub-thread-tid has > > > > > >proc_tgid_base_operations despite not being a thread group leader. > > > > > >(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can > > > > > >be hit trivially, and then the code will misbehave. > > > > > > > > > > > >@Joel: I think you'll have to either rewrite this to explicitly bail > > > > > >out if you're dealing with a thread group leader, or make the code > > > > > >work for threads, too. > > > > > > > > > > The latter case probably being preferred if this API is supposed to be > > > > > useable for thread management in userspace. > > > > > > > > At the moment, we are not planning to use this for sub-thread management. I > > > > am reworking this patch to only work on clone(2) pidfds which makes the above > > > > > > Indeed and agreed. > > > > > > > discussion about /proc a bit unnecessary I think. Per the latest CLONE_PIDFD > > > > patches, CLONE_THREAD with pidfd is not supported. > > > > > > Yes. We have no one asking for it right now and we can easily add this > > > later. > > > > > > Admittedly I haven't gotten around to reviewing the patches here yet > > > completely. But one thing about using POLLIN. FreeBSD is using POLLHUP > > > on process exit which I think is nice as well. How about returning > > > POLLIN | POLLHUP on process exit? > > > We already do things like this. For example, when you proxy between > > > ttys. If the process that you're reading data from has exited and closed > > > it's end you still can't usually simply exit because it might have still > > > buffered data that you want to read. The way one can deal with this > > > from userspace is that you can observe a (POLLHUP | POLLIN) event and > > > you keep on reading until you only observe a POLLHUP without a POLLIN > > > event at which point you know you have read > > > all data. > > > I like the semantics for pidfds as well as it would indicate: > > > - POLLHUP -> process has exited > > > - POLLIN -> information can be read > > > > Actually I think a bit different about this, in my opinion the pidfd should > > always be readable (we would store the exit status somewhere in the future > > which would be readable, even after task_struct is dead). So I was thinking > > we always return EPOLLIN. If process has not exited, then it blocks. > > ITYM that a pidfd polls as readable *once a task exits* and stays > readable forever. Before a task exit, a poll on a pidfd should *not* > yield POLLIN and reading that pidfd should *not* complete immediately. > There's no way that, having observed POLLIN on a pidfd, you should > ever then *not* see POLLIN on that pidfd in the future --- it's a > one-way transition from not-ready-to-get-exit-status to > ready-to-get-exit-status.
What do you consider interesting state transitions? A listener on a pidfd in epoll_wait() might be interested if the process execs for example. That's a very valid use-case for e.g. systemd.
Sure, but systemd is specialized.
So is Android and we're not designing an interface for Android but for all of userspace.
Nothing in my post is Android-specific. Waiting for non-child processes is something that lots of people want to do, which is why patches to enable it have been getting posted every few years for many years (e.g., Andy's from 2011). I, too, want to make an API for all over userspace. Don't attribute to me arguments that I'm not actually making.
I hope this is clear. Service managers are quite important and systemd is the largest one and they can make good use of this feature.
Service managers already have the tools they need to do their job. The
No they don't. Even if they quite often have kludges and run into a lot of problems. That's why there's interest in these features as well.
Yes, and these facilities should have a powerful toolkit that they can use to do their job in the right way. This toolkit will probably involve pretty powerful kinds of process monitoring. I don't see a reason to gate the ability to wait for process death via pidfd on that toolkit. Please don't interpret my position as saying that the service monitor usecase is unimportant or not worth adding to Linux.
Great. It sounded like it.
<snip>
Well, daemons tend to do those things do. System managers and container managers are just an example of a whole class. Even if you just consider system managers like openrc, systemd you have gotten yourself quite a large userbase.
When I said "niche", I didn't mean "unimportant". I meant "specialized", as in the people who write these sorts of programs are willing to dive in to low-level operational details and get things right. I also think the community of people *writing* programs like systemd is relatively small.
The point is that these tools are widely used not how many people develop it and making their live easier and listening to their use-cases as well is warranted.
<snip-it-just-gets-too-long>
FreeBSD obviously has thought about being able to observe more than just NOTE_EXIT in the future.
Yes. Did I say anywhere that we should never be able to observe execs and forks? I think that what FreeBSD got *right* is making process exit status broadly available. What I think they got wrong is the mixing of exit information with other EVFILT_PROCDESC messages.
wait for processes to exit and to retrieve their exit information.
Speaking of pkill: AIUI, in your current patch set, one can get a pidfd *only* via clone. Joel indicated that he believes poll(2) shouldn't be supported on procfs pidfds. Is that your thinking as well? If that's the case, then we're in a state where non-parents
Yes, it is.
If reading process status information from a pidfd is destructive, it's dangerous to share pidfds between processes. If reading information *isn't* destructive, how are you supposed to use poll(2) to wait for the next transition? Is poll destructive? If you can only make a new pidfd via clone, you can't get two separate event streams for two different users. Sharing a single pidfd via dup or SCM_RIGHTS becomes dangerous, because if reading status is destructive, only one reader can observe each event. Your proposed edge-triggered design makes pidfds significantly less useful, because in your design, it's unsafe to share a single pidfd open file description *and* there's no way to create a new pidfd open file description for an existing process.
I think we should make an API for all of userspace and not just for container managers and systemd.
I mean, you can go and try making arguments based on syntactical rearrangements of things I said but I'm going to pass.
I'd prefer if we focused on substantive technical issues instead of unproductive comments on syntax. I'd prefer to talk about the
Great. Your
"I think we should make an API for all of userspace and not just for container managers and systemd."
very much sounded like you're mocking my point about being aware of other usecases apart from Android. Maybe you did not intend it that way. Very much felt like it.
technical concerns I raised regarding an edge-triggered event-queue design making pidfd file descriptor sharing dangerous.
My point simply was: There are more users that would be interested in observing more state transitions in the future. Your argument made it sound like they are not worth considering. I disagree.
I believe you've misunderstood me. I've never said that this use case is "not worth considering". I would appreciate it if you wouldn't claim that I believe these use cases aren't worth considering. My
I didn't claim you believe it rather you made it sound like and you did dismiss those use-cases as driving factors when thinking about this api.
point is that these uses would be better served through a dedicated process event monitoring facility, one that could replace ptrace. I would be thrilled by something like that. The point I'm making, to be very clear, is *NOT* that process monitoring is "not worth considering", but that process monitoring is subtle and complicated enough that it ought to be considered as a standalone project, independent of pidfds proper and of the very simple and effective pidfd system that Joel has proposed in his patch series.
can't wait for process exit, and providing this facility is an important goal of the whole project.
That's your goal.
I thought we all agreed on that months ago that it's reasonable to allow processes to wait for non-child processes to exit. Now, out of
Uhm, I can't remember being privy to that agreement but the threads get so long that maybe I forgot what I wrote?
the blue, you're saying that 1) actually, we want a rich API for all kinds of things that aren't process exit, because systemd, and 2)
- I'm not saying we have to. It just makes it more flexible and is something
we can at least consider.
I've spent a few emails and a lot of thought considering the idea. If I weren't considering it, I wouldn't have thought through the implications of destructive event reads above. My position, *after due consideration*, is that we're better off making the pidfd poll() system a level-triggered exit-only signal and defer more sophisticated
That's something you'd need to explain in more depth and why if it works for FreeBSD we can't just do the same.
monitoring to a separate facility. I'm not saying that we shouldn't be able to monitor processes. I'm saying that there are important technical and API design reasons why the initial version of the pidfd wait system should be simple and support only one thing. The model you've proposed has important technical disadvantages, and you haven't addressed these technical disadvantages.
- systemd is an example of another *huge* user of this api. That neither implies
this api is "because systemd" it simply makes it worth that we consider this use-case.
actually, non-parents shouldn't be able to wait for process death. I
I'm sorry, who has agreed that a non-parent should be able to wait for process death?
Aleksa Sarai, for starters. See [1]. In any case, I don't see where your sudden opposition to the idea is coming from. I've been saying over and over that it's important that we allow processes to wait for the death of non-children. Why are you just objecting now? And on what
First of all, because I haven't been Cced on that thread so I didn't follow it.
basis are you objecting? Why *shouldn't* we be able to wait for process death in a general way? Both FreeBSD and Windows can do it.
Because I consider this to be quite a controversial change with a lot of implications. And as I said in the previous mail: If you have the support and the Acks and Reviews you are more than welcome to do this. I don't think this is relevant for the polling patchset though. But please, see my comment below because I think we might have talked about different things...
I know you proposed that but has anyone ever substantially supported this? I'm happy if you can gather the necessary support for this but I just haven't seen that yet.
I recommend re-reading the whole exithand thread, since we covered a lot of the ground that we're also covering here. In any case, there's a legitimate technical reason why we'd want to wait for non-child death, and I would appreciate it if you didn't just summarily dismiss it as "[my] goal". Let's talk about what's best for users, not use that kind of unproductive rhetoric.
Wait, to clarify: Are you saying any process should be able to call waitid(pid/pidfd) on a non-child or are you saying that a process just needs a way to block until another process has exited and then - if the parent has reaped it - read its exit status. I agree with the latter it's the former I find strange. The mail reads like the latter though.
[1] https://lore.kernel.org/lkml/20181101070036.l24c2p432ohuwmqf@yavin/
On 20.04.19 01:29, Daniel Colascione wrote:
The point I'm making, to be> very clear, is *NOT* that process monitoring is "not worth>
considering", but that process monitoring is subtle and complicated> enough that it ought to be considered as a standalone project,> independent of pidfds proper and of the very simple and effective> pidfd system that Joel has proposed in his patch series.
At that point I'm wondering: what pidfd is actually meant for, if not process monitoring ?
--mtx
On 20.04.19 01:02, Christian Brauner wrote:
I'm sorry, who has agreed that a non-parent should be able to wait for process death? I know you proposed that but has anyone ever substantially supported this?
I actually support this.
One practical usecase I have in mind for some upcoming projects is monitoring processes in a container from the outside.
--mtx
On 19.04.19 23:48, Christian Brauner wrote:
So is Android and we're not designing an interface for Android but for all of userspace. I hope this is clear. Service managers are quite important and systemd is the largest one and they can make good use of this feature.
Maybe just talk about talking about service managers and not even mentioning systemd ;-)
That whole pargraph is about dismissing a range of valid use-cases based on assumptions such as "way more common" and even argues that service managers are special cases and therefore not really worth considering. I would like to be more open to other use cases.
ACK. Things like pidfd can make life for service managers a lot easier, and even allow whole new scenarios.
For example, the monitoring part doesn't need to be the spawning process anymore. The spawner could send the pidfd to the monitor, eg. via unix socket. If we had plan9-like srv filesystem (something I've got on my todo list) we could even make the pidfd's available in the fs and so replace the old pidfiles (pidfile now would be really a process handle instead of just a scratchpad for writing down the pid).
Joel indicated that he believes poll(2) shouldn't be supported on procfs pidfds. Is that your thinking as well? If that's the case, then we're in a state where non-parents
Yes, it is.
Why so ?
IMHO, it would be much more useful, if another process can get the same process handle via procfs. This also would allow the scenario described above w/o srvfs: pidfiles could just be symlinks to the corresponding /proc files.
--mtx
On 19.04.19 23:21, Daniel Colascione wrote:
EPOLLIN on a pidfd could very well mean that data can be read via a read() on the pidfd *other* than the exit status. The read could e.g. give you a lean struct that indicates the type of state transition: NOTIFY_EXIT, NOTIFY_EXEC, etc.. This way we are not bound to a specific poll event indicating a specific state. Though there's a case to be made that EPOLLHUP could indicate process exit and EPOLLIN a state change + read().
And do you imagine making read() destructive? Does that read() then reset the POLLIN state? You're essentially proposing that a pidfd provide an "event stream" interface, delivering notifications packets that indicate state changes like "process exited" or "process stopped" or "process execed". While this sort of interface is powerful and has some nice properties that tools like debuggers and daemon monitors might want to use, I think it's too complicated and error prone for the overwhelmingly common case of wanting to monitor process lifetime.
I don't think it's so complicated. read() + comparing a few bits (eg. strncmp(), if the packets are strings) really isn't a big deal.
Actually, I proposed quite the same (before I read this mail).
I like Linus' idea of just making waitid(2) (not waitpid(2), as I mistakenly mentioned earlier) on a pidfd act *exactly* like a waitid(2) on the corresponding process and making POLLIN just mean "waitid will succeed". It's a nice simple model that's easy to reason about and that makes it easy to port existing code to pidfds.
Okay, that's probably a good starting point. We could add more sophisticated monitoring later.
--mtx
On 04/18, Jann Horn wrote:
On Wed, Apr 17, 2019 at 3:09 PM Oleg Nesterov oleg@redhat.com wrote:
On 04/16, Joel Fernandes wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
Could you explain when it should return POLLIN? When the whole process exits?
It returns POLLIN when the task is dead or doesn't exist anymore, or when it is in a zombie state and there's no other thread in the thread group.
IOW, when the whole thread group exits, so it can't be used to monitor sub-threads.
just in case... speaking of this patch it doesn't modify proc_tid_base_operations, so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are going to use the anonymous file returned by CLONE_PIDFD ?
I don't think procfs works that way. /proc/sub-thread-tid has proc_tgid_base_operations despite not being a thread group leader.
Yes, sorry, I meant /proc/pid/task/sub-thread-tid.
But poll("/proc/sub-thread-tid") won't work too, we can't rely on do_notify_parent() if the task is not a group leader.
(Yes, that's kinda weird.) AFAICS the WARN_ON_ONCE() in this code can be hit trivially, and then the code will misbehave.
Heh, I didn't even notice that WARN_ON_ONCE(task && !thread_group_leader(task)) ;)
@Joel: I think you'll have to either rewrite this to explicitly bail out if you're dealing with a thread group leader, or make the code work for threads, too.
The last version of CLONE_PIDFD doesn't allow CLONE_THREAD, so we can forget about this problem for now.
Oleg.
Just returned to work today dealing with "life" issues, apologies for the delays in replying. :)
On Wed, Apr 17, 2019 at 03:09:41PM +0200, Oleg Nesterov wrote:
On 04/16, Joel Fernandes wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
Could you explain when it should return POLLIN? When the whole process exits?
It returns POLLIN when the task is dead or doesn't exist anymore, or when it is in a zombie state and there's no other thread in the thread group.
IOW, when the whole thread group exits, so it can't be used to monitor sub-threads.
just in case... speaking of this patch it doesn't modify proc_tid_base_operations, so you can't poll("/proc/sub-thread-tid") anyway, but iiuc you are going to use the anonymous file returned by CLONE_PIDFD ?
Yes, I am going to be converting to non-proc file returned by CLONE_PIDFD, yes. (But I am still catching up with all threads and will read the latest on whether we are still consider proc pidfds, last I understand - we are not).
Then all you need is
!task || task->exit_state && thread_group_empty(task)
Yes this works as well, all the tests pass with your suggestion so I'll change it to that. Although I will the be giving up returing EPOLLERR if the task_struct doesn't exit. We don't need that, but I thought it was cool to return it anyway.
OK, task == NULL means that it was already reaped by parent, pid_nr is free, probably useful....
Ok I will add that semantic as well then.
Please do not use EXIT_DEAD/EXIT_ZOMBIE. And ->wait_pidfd should probably live in task->signal_struct.
About wait_pidfd living in signal_struct, that wont work since the waitqueue has to survive for the duration of the poll system call.
That is why I said this will need the additional cleanup in free_signal_struct(). But I was wrong, somehow I forgot that free_poll_entry() needs wq_head->lock ;) so this will need much more complications, lets forget it...
Ok np :)
Also the waitqueue living in struct pid solves the de_thread() issue I mentioned later in the following thread and in the commit message: https://lore.kernel.org/patchwork/comment/1257175/
Hmm...
- By including the struct pid for the waitqueue means that during
de_exec, the thread doing de_thread() automatically gets the new waitqueue/pid even though its task_struct is different.
this one?
this is not true, or I do not understand...
it gets the _same_ (old, not new) PIDTYPE_TGID pid even if it changes task_struct. But probably this is what you actually meant, because this is what your patch wants or I am totally confused.
Yes, that's what I meant, sorry.
And note that exec/de_thread doesn't change ->signal_struct, so I do not understand you anyway. Nevermind.
Yes right, but the signal_struct would suffer from the waitqueue lifetime issue anyway so we can't use it. The current patch works well for everything.
thanks,
- Joel
On Tue, Apr 16, 2019 at 8:21 PM Joel Fernandes joel@joelfernandes.org wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
On 04/11, Joel Fernandes (Google) wrote:
+static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts) +{
- int poll_flags = 0;
- struct task_struct *task;
- struct pid *pid;
- task = get_proc_task(file->f_path.dentry->d_inode);
- WARN_ON_ONCE(task && !thread_group_leader(task));
- /*
- tasklist_lock must be held because to avoid racing with
- changes in exit_state and wake up. Basically to avoid:
- P0: read exit_state = 0
- P1: write exit_state = EXIT_DEAD
- P1: Do a wake up - wq is empty, so do nothing
- P0: Queue for polling - wait forever.
- */
- read_lock(&tasklist_lock);
- if (!task)
poll_flags = POLLIN | POLLRDNORM | POLLERR;
- else if (task->exit_state == EXIT_DEAD)
poll_flags = POLLIN | POLLRDNORM;
- else if (task->exit_state == EXIT_ZOMBIE && thread_group_empty(task))
poll_flags = POLLIN | POLLRDNORM;
- if (!poll_flags) {
pid = proc_pid(file->f_path.dentry->d_inode);
poll_wait(file, &pid->wait_pidfd, pts);
- }
can't understand...
Could you explain when it should return POLLIN? When the whole process exits?
It returns POLLIN when the task is dead or doesn't exist anymore, or when it is in a zombie state and there's no other thread in the thread group.
Would using something other than POLLIN be an option (maybe POLLPRI)? The convention is to use it to indicate readability on the descriptor, and also possibly POLLHUP instead of POLLERR (the latter is less of a problem, but FreeBSD also does the same, so it'd help with some consistency for libraries wanting to use this, which aren't interested in other sub states).
- Joel
On Thu, Apr 18, 2019 at 11:44 AM Jonathan Kowalski bl0pbl33p@gmail.com wrote:
On Tue, Apr 16, 2019 at 8:21 PM Joel Fernandes joel@joelfernandes.org wrote:
On Tue, Apr 16, 2019 at 02:04:31PM +0200, Oleg Nesterov wrote:
On 04/11, Joel Fernandes (Google) wrote:
+static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts) +{
- int poll_flags = 0;
- struct task_struct *task;
- struct pid *pid;
- task = get_proc_task(file->f_path.dentry->d_inode);
- WARN_ON_ONCE(task && !thread_group_leader(task));
- /*
- tasklist_lock must be held because to avoid racing with
- changes in exit_state and wake up. Basically to avoid:
- P0: read exit_state = 0
- P1: write exit_state = EXIT_DEAD
- P1: Do a wake up - wq is empty, so do nothing
- P0: Queue for polling - wait forever.
- */
- read_lock(&tasklist_lock);
- if (!task)
poll_flags = POLLIN | POLLRDNORM | POLLERR;
- else if (task->exit_state == EXIT_DEAD)
poll_flags = POLLIN | POLLRDNORM;
- else if (task->exit_state == EXIT_ZOMBIE && thread_group_empty(task))
poll_flags = POLLIN | POLLRDNORM;
- if (!poll_flags) {
pid = proc_pid(file->f_path.dentry->d_inode);
poll_wait(file, &pid->wait_pidfd, pts);
- }
can't understand...
Could you explain when it should return POLLIN? When the whole process exits?
It returns POLLIN when the task is dead or doesn't exist anymore, or when it is in a zombie state and there's no other thread in the thread group.
Would using something other than POLLIN be an option (maybe POLLPRI)? The convention is to use it to indicate readability on the descriptor, and also possibly POLLHUP instead of POLLERR (the latter is less of a problem, but FreeBSD also does the same, so it'd help with some consistency for libraries wanting to use this, which aren't interested in other sub states).
Existing event loop libraries generally support checking only for readability and writability. Not setting POLLIN would make these FDs more difficult to use with existing event loop libraries. What advantage would compensate for this difficulty?
On Thu, Apr 18, 2019 at 11:58 AM Daniel Colascione dancol@google.com wrote:
On Thu, Apr 18, 2019 at 11:44 AM Jonathan Kowalski bl0pbl33p@gmail.com wrote:
Would using something other than POLLIN be an option (maybe POLLPRI)? The convention is to use it to indicate readability on the descriptor, and also possibly POLLHUP instead of POLLERR (the latter is less of a problem, but FreeBSD also does the same, so it'd help with some consistency for libraries wanting to use this, which aren't interested in other sub states).
Existing event loop libraries generally support checking only for readability and writability. Not setting POLLIN would make these FDs more difficult to use with existing event loop libraries. What advantage would compensate for this difficulty?
Right.
Usually you'd set POLLIN in _addition_ to any other more specialized poll flag.
For example, when a socket has shut down the read side, we do
if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
because while it's true that EPOLLRDHUP is the most _specific_ poll bit, it's _also_ true that a read shutdown means that the read() will return immediately.
So generally a HUP condition should mean that POLLIN and POLLOUT also get set. Not because there's any actual _data_ to be read, but simply because the read will not block.
Linus
On Thu, Apr 18, 2019 at 12:14:58PM -0700, Linus Torvalds wrote:
On Thu, Apr 18, 2019 at 11:58 AM Daniel Colascione dancol@google.com wrote:
On Thu, Apr 18, 2019 at 11:44 AM Jonathan Kowalski bl0pbl33p@gmail.com wrote:
Would using something other than POLLIN be an option (maybe POLLPRI)? The convention is to use it to indicate readability on the descriptor, and also possibly POLLHUP instead of POLLERR (the latter is less of a problem, but FreeBSD also does the same, so it'd help with some consistency for libraries wanting to use this, which aren't interested in other sub states).
Existing event loop libraries generally support checking only for readability and writability. Not setting POLLIN would make these FDs more difficult to use with existing event loop libraries. What advantage would compensate for this difficulty?
Right.
Usually you'd set POLLIN in _addition_ to any other more specialized poll flag.
For example, when a socket has shut down the read side, we do
if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
because while it's true that EPOLLRDHUP is the most _specific_ poll bit, it's _also_ true that a read shutdown means that the read() will return immediately.
So generally a HUP condition should mean that POLLIN and POLLOUT also get set. Not because there's any actual _data_ to be read, but simply because the read will not block.
Sounds great and I agree with Linus and Daniel. So I am guessing you are Ok with the current set of flags proposed this patch, so I will keep them intact in future patch postings. But please let me know if you want me to change something about the flags.
thanks!
- Joel
linux-kselftest-mirror@lists.linaro.org