The primary motivation for this flag is container runtimes, which have to interact with malicious root filesystems in the host namespaces. One of the first requirements for a container runtime to be secure against a malicious rootfs is that it correctly scopes symlinks (that is, symlinks should be scoped as though the process were chroot(2)ed into the container's rootfs) and ".."-style paths. The already-existing AT_XDEV and AT_NO_PROCLINKS help defend against other potential attacks in a malicious rootfs scenario.
Currently, most container runtimes try to do this resolution in userspace[1], which causes many potential race conditions. In addition, the "obvious" alternative (actually performing a {ch,pivot_}root(2)) requires a fork+exec, which is *very* costly if it is necessary for every filesystem operation involving a container.
The most significant change in semantics with AT_THIS_ROOT is that, when the flag is specified (i.e. LOOKUP_CHROOT is set internally), *at(2) syscalls no longer have the property that an absolute pathname causes the dirfd to be ignored completely. The reasoning behind this is that AT_THIS_ROOT necessarily has to chroot-scope symlinks with absolute paths to dirfd, and so doing the same for the base path seems to be the most consistent behaviour (and also avoids foot-gunning users who want to chroot-scope paths that might be absolute).
Currently this is only enabled for the stat(2) and openat(2) families (the latter has its own flag, O_THISROOT, with the same semantics). Ideally this flag would be supported by all *at(2) syscalls, but that will require adding a flags argument to many of them (and will be done in a separate patchset).
[1]: https://github.com/cyphar/filepath-securejoin
Cc: Eric Biederman ebiederm@xmission.com Cc: Christian Brauner christian@brauner.io Signed-off-by: Aleksa Sarai cyphar@cyphar.com --- fs/fcntl.c | 2 +- fs/namei.c | 121 +++++++++++++++++-------------- fs/open.c | 2 + fs/stat.c | 4 +- include/linux/fcntl.h | 2 +- include/linux/namei.h | 1 + include/uapi/asm-generic/fcntl.h | 3 + include/uapi/linux/fcntl.h | 2 + 8 files changed, 81 insertions(+), 56 deletions(-)
diff --git a/fs/fcntl.c b/fs/fcntl.c index e343618736f7..4c36c5b9fdb9 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1031,7 +1031,7 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(25 - 1 /* for O_RDONLY being 0 */ != + BUILD_BUG_ON(26 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | __FMODE_EXEC | __FMODE_NONOTIFY)); diff --git a/fs/namei.c b/fs/namei.c index 757dd783771c..1b984f0dbbb4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2193,9 +2193,64 @@ static int link_path_walk(const char *name, struct nameidata *nd) } }
+/* + * Configure nd->path based on the nd->dfd. This is only used as part of + * path_init(). + */ +static inline int dirfd_path_init(struct nameidata *nd) +{ + if (nd->dfd == AT_FDCWD) { + if (nd->flags & LOOKUP_RCU) { + struct fs_struct *fs = current->fs; + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + nd->path = fs->pwd; + nd->inode = nd->path.dentry->d_inode; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } else { + get_fs_pwd(current->fs, &nd->path); + nd->inode = nd->path.dentry->d_inode; + } + } else { + /* Caller must check execute permissions on the starting path component */ + struct fd f = fdget_raw(nd->dfd); + struct dentry *dentry; + + if (!f.file) + return -EBADF; + + dentry = f.file->f_path.dentry; + + if (*nd->name->name && unlikely(!d_can_lookup(dentry))) { + fdput(f); + return -ENOTDIR; + } + + nd->path = f.file->f_path; + if (nd->flags & LOOKUP_RCU) { + nd->inode = nd->path.dentry->d_inode; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + } else { + path_get(&nd->path); + nd->inode = nd->path.dentry->d_inode; + } + fdput(f); + } + if (unlikely(nd->flags & (LOOKUP_CHROOT | LOOKUP_BENEATH))) { + nd->root = nd->path; + if (!(nd->flags & LOOKUP_RCU)) + path_get(&nd->root); + } + return 0; +} + /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { + int error; const char *s = nd->name->name;
if (!*s) @@ -2230,65 +2285,25 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->path.dentry = NULL;
nd->m_seq = read_seqbegin(&mount_lock); + if (unlikely(flags & LOOKUP_CHROOT)) { + error = dirfd_path_init(nd); + if (unlikely(error)) + return ERR_PTR(error); + } if (*s == '/') { - int error; - set_root(nd); + if (likely(!nd->root.mnt)) + set_root(nd); error = nd_jump_root(nd); if (unlikely(error)) s = ERR_PTR(error); return s; - } else if (nd->dfd == AT_FDCWD) { - if (flags & LOOKUP_RCU) { - struct fs_struct *fs = current->fs; - unsigned seq; - - do { - seq = read_seqcount_begin(&fs->seq); - nd->path = fs->pwd; - nd->inode = nd->path.dentry->d_inode; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); - } else { - get_fs_pwd(current->fs, &nd->path); - nd->inode = nd->path.dentry->d_inode; - } - if (unlikely(flags & LOOKUP_BENEATH)) { - nd->root = nd->path; - if (!(flags & LOOKUP_RCU)) - path_get(&nd->root); - } - return s; - } else { - /* Caller must check execute permissions on the starting path component */ - struct fd f = fdget_raw(nd->dfd); - struct dentry *dentry; - - if (!f.file) - return ERR_PTR(-EBADF); - - dentry = f.file->f_path.dentry; - - if (*s && unlikely(!d_can_lookup(dentry))) { - fdput(f); - return ERR_PTR(-ENOTDIR); - } - - nd->path = f.file->f_path; - if (flags & LOOKUP_RCU) { - nd->inode = nd->path.dentry->d_inode; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - } else { - path_get(&nd->path); - nd->inode = nd->path.dentry->d_inode; - } - if (unlikely(flags & LOOKUP_BENEATH)) { - nd->root = nd->path; - if (!(flags & LOOKUP_RCU)) - path_get(&nd->root); - } - fdput(f); - return s; } + if (likely(!nd->path.mnt)) { + error = dirfd_path_init(nd); + if (unlikely(error)) + return ERR_PTR(error); + } + return s; }
static const char *trailing_symlink(struct nameidata *nd) diff --git a/fs/open.c b/fs/open.c index 80f5f566a5ff..81d148f626cd 100644 --- a/fs/open.c +++ b/fs/open.c @@ -996,6 +996,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o lookup_flags |= LOOKUP_NO_PROCLINKS; if (flags & O_NOSYMLINKS) lookup_flags |= LOOKUP_NO_SYMLINKS; + if (flags & O_THISROOT) + lookup_flags |= LOOKUP_CHROOT; op->lookup_flags = lookup_flags; return 0; } diff --git a/fs/stat.c b/fs/stat.c index 791e61b916ae..e8366e4812c3 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -172,7 +172,7 @@ int vfs_statx(int dfd, const char __user *filename, int flags,
if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | KSTAT_QUERY_FLAGS | AT_BENEATH | AT_XDEV | - AT_NO_PROCLINKS | AT_NO_SYMLINKS)) + AT_NO_PROCLINKS | AT_NO_SYMLINKS | AT_THIS_ROOT)) return -EINVAL;
if (flags & AT_SYMLINK_NOFOLLOW) @@ -189,6 +189,8 @@ int vfs_statx(int dfd, const char __user *filename, int flags, lookup_flags |= LOOKUP_NO_PROCLINKS; if (flags & AT_NO_SYMLINKS) lookup_flags |= LOOKUP_NO_SYMLINKS; + if (flags & AT_THIS_ROOT) + lookup_flags |= LOOKUP_CHROOT;
retry: error = user_path_at(dfd, filename, lookup_flags, &path); diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index ad5bba4b5b12..95480cd4c09d 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -10,7 +10,7 @@ O_APPEND | O_NDELAY | O_NONBLOCK | O_NDELAY | __O_SYNC | O_DSYNC | \ FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \ O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_BENEATH | O_XDEV | \ - O_NOPROCLINKS | O_NOSYMLINKS) + O_NOPROCLINKS | O_NOSYMLINKS | O_THISROOT)
#ifndef force_o_largefile #define force_o_largefile() (BITS_PER_LONG != 32) diff --git a/include/linux/namei.h b/include/linux/namei.h index 5ff7f3362d1b..7ec9e2d84649 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -53,6 +53,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_NO_PROCLINKS 0x040000 /* No /proc/$pid/fd/ "symlink" crossing. */ #define LOOKUP_NO_SYMLINKS 0x080000 /* No symlink crossing *at all*. Implies LOOKUP_NO_PROCLINKS. */ +#define LOOKUP_CHROOT 0x100000 /* Treat dirfd as %current->fs->root. */
extern int path_pts(struct path *path);
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index c2bf5983e46a..11206b0e927c 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -113,6 +113,9 @@ #ifndef O_NOSYMLINKS #define O_NOSYMLINKS 01000000000 #endif +#ifndef O_THISROOT +#define O_THISROOT 02000000000 +#endif
#define F_DUPFD 0 /* dup */ #define F_GETFD 1 /* get close_on_exec */ diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 551a9e2166a8..ea978457b68f 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -99,6 +99,8 @@ #define AT_NO_PROCLINKS 0x40000 /* No /proc/$pid/fd/... "symlinks". */ #define AT_NO_SYMLINKS 0x80000 /* No symlinks *at all*. Implies AT_NO_PROCLINKS. */ +#define AT_THIS_ROOT 0x100000 /* Path resolution acts as though + it is chroot-ed into dirfd. */
#endif /* _UAPI_LINUX_FCNTL_H */