This introduces signal->exec_bprm, which is used to
fix the case when at least one of the sibling threads
is traced, and therefore the trace process may dead-lock
in ptrace_attach, but de_thread will need to wait for the
tracer to continue execution.
The solution is to detect this situation and allow
ptrace_attach to continue by temporarily releasing the
cred_guard_mutex, while de_thread() is still waiting for
traced zombies to be eventually released by the tracer.
In the case of the thread group leader we only have to wait
for the thread to become a zombie, which may also need
co-operation from the tracer due to PTRACE_O_TRACEEXIT.
When a tracer wants to ptrace_attach a task that already
is in execve, we simply retry the ptrace_may_access
check while temporarily installing the new credentials
and dumpability which are about to be used after execve
completes. If the ptrace_attach happens on a thread that
is a sibling-thread of the thread doing execve, it is
sufficient to check against the old credentials, as this
thread will be waited for, before the new credentials are
installed.
Other threads die quickly since the cred_guard_mutex is
released, but a deadly signal is already pending. In case
the mutex_lock_killable misses the signal, the non-zero
current->signal->exec_bprm makes sure they release the
mutex immediately and return with -ERESTARTNOINTR.
This means there is no API change, unlike the previous
version of this patch which was discussed here:
https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.d…
See tools/testing/selftests/ptrace/vmaccess.c
for a test case that gets fixed by this change.
Note that since the test case was originally designed to
test the ptrace_attach returning an error in this situation,
the test expectation needed to be adjusted, to allow the
API to succeed at the first attempt.
Signed-off-by: Bernd Edlinger <bernd.edlinger(a)hotmail.de>
---
fs/exec.c | 69 ++++++++++++++++-------
fs/proc/base.c | 6 ++
include/linux/cred.h | 1 +
include/linux/sched/signal.h | 18 ++++++
kernel/cred.c | 28 +++++++--
kernel/ptrace.c | 32 +++++++++++
kernel/seccomp.c | 12 +++-
tools/testing/selftests/ptrace/vmaccess.c | 23 +++++---
8 files changed, 155 insertions(+), 34 deletions(-)
v10: Changes to previous version, make the PTRACE_ATTACH
retun -EAGAIN, instead of execve return -ERESTARTSYS.
Added some lessions learned to the description.
v11: Check old and new credentials in PTRACE_ATTACH again without
changing the API.
Note: I got actually one response from an automatic checker to the v11 patch,
https://lore.kernel.org/lkml/202107121344.wu68hEPF-lkp@intel.com/
which is complaining about:
>> kernel/ptrace.c:425:26: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct cred const *old_cred @@ got struct cred const [noderef] __rcu *real_cred @@
417 struct linux_binprm *bprm = task->signal->exec_bprm;
418 const struct cred *old_cred;
419 struct mm_struct *old_mm;
420
421 retval = down_write_killable(&task->signal->exec_update_lock);
422 if (retval)
423 goto unlock_creds;
424 task_lock(task);
> 425 old_cred = task->real_cred;
v12: Essentially identical to v11.
- Fixed a minor merge conflict in linux v5.17, and fixed the
above mentioned nit by adding __rcu to the declaration.
- re-tested the patch with all linux versions from v5.11 to v6.6
v10 was an alternative approach which did imply an API change.
But I would prefer to avoid such an API change.
The difficult part is getting the right dumpability flags assigned
before de_thread starts, hope you like this version.
If not, the v10 is of course also acceptable.
Thanks
Bernd.
diff --git a/fs/exec.c b/fs/exec.c
index 2f2b0acec4f0..902d3b230485 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1041,11 +1041,13 @@ static int exec_mmap(struct mm_struct *mm)
return 0;
}
-static int de_thread(struct task_struct *tsk)
+static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm)
{
struct signal_struct *sig = tsk->signal;
struct sighand_struct *oldsighand = tsk->sighand;
spinlock_t *lock = &oldsighand->siglock;
+ struct task_struct *t = tsk;
+ bool unsafe_execve_in_progress = false;
if (thread_group_empty(tsk))
goto no_thread_group;
@@ -1068,6 +1070,19 @@ static int de_thread(struct task_struct *tsk)
if (!thread_group_leader(tsk))
sig->notify_count--;
+ while_each_thread(tsk, t) {
+ if (unlikely(t->ptrace)
+ && (t != tsk->group_leader || !t->exit_state))
+ unsafe_execve_in_progress = true;
+ }
+
+ if (unlikely(unsafe_execve_in_progress)) {
+ spin_unlock_irq(lock);
+ sig->exec_bprm = bprm;
+ mutex_unlock(&sig->cred_guard_mutex);
+ spin_lock_irq(lock);
+ }
+
while (sig->notify_count) {
__set_current_state(TASK_KILLABLE);
spin_unlock_irq(lock);
@@ -1158,6 +1173,11 @@ static int de_thread(struct task_struct *tsk)
release_task(leader);
}
+ if (unlikely(unsafe_execve_in_progress)) {
+ mutex_lock(&sig->cred_guard_mutex);
+ sig->exec_bprm = NULL;
+ }
+
sig->group_exec_task = NULL;
sig->notify_count = 0;
@@ -1169,6 +1189,11 @@ static int de_thread(struct task_struct *tsk)
return 0;
killed:
+ if (unlikely(unsafe_execve_in_progress)) {
+ mutex_lock(&sig->cred_guard_mutex);
+ sig->exec_bprm = NULL;
+ }
+
/* protects against exit_notify() and __exit_signal() */
read_lock(&tasklist_lock);
sig->group_exec_task = NULL;
@@ -1253,6 +1278,24 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
return retval;
+ /* If the binary is not readable then enforce mm->dumpable=0 */
+ would_dump(bprm, bprm->file);
+ if (bprm->have_execfd)
+ would_dump(bprm, bprm->executable);
+
+ /*
+ * Figure out dumpability. Note that this checking only of current
+ * is wrong, but userspace depends on it. This should be testing
+ * bprm->secureexec instead.
+ */
+ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
+ is_dumpability_changed(current_cred(), bprm->cred) ||
+ !(uid_eq(current_euid(), current_uid()) &&
+ gid_eq(current_egid(), current_gid())))
+ set_dumpable(bprm->mm, suid_dumpable);
+ else
+ set_dumpable(bprm->mm, SUID_DUMP_USER);
+
/*
* Ensure all future errors are fatal.
*/
@@ -1261,7 +1304,7 @@ int begin_new_exec(struct linux_binprm * bprm)
/*
* Make this the only thread in the thread group.
*/
- retval = de_thread(me);
+ retval = de_thread(me, bprm);
if (retval)
goto out;
@@ -1284,11 +1327,6 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out;
- /* If the binary is not readable then enforce mm->dumpable=0 */
- would_dump(bprm, bprm->file);
- if (bprm->have_execfd)
- would_dump(bprm, bprm->executable);
-
/*
* Release all of the old mmap stuff
*/
@@ -1350,18 +1388,6 @@ int begin_new_exec(struct linux_binprm * bprm)
me->sas_ss_sp = me->sas_ss_size = 0;
- /*
- * Figure out dumpability. Note that this checking only of current
- * is wrong, but userspace depends on it. This should be testing
- * bprm->secureexec instead.
- */
- if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
- !(uid_eq(current_euid(), current_uid()) &&
- gid_eq(current_egid(), current_gid())))
- set_dumpable(current->mm, suid_dumpable);
- else
- set_dumpable(current->mm, SUID_DUMP_USER);
-
perf_event_exec();
__set_task_comm(me, kbasename(bprm->filename), true);
@@ -1480,6 +1506,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
return -ERESTARTNOINTR;
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ return -ERESTARTNOINTR;
+ }
+
bprm->cred = prepare_exec_creds();
if (likely(bprm->cred))
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..0da9adfadb48 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2788,6 +2788,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ rv = -ERESTARTNOINTR;
+ goto out_free;
+ }
+
rv = security_setprocattr(PROC_I(inode)->op.lsm,
file->f_path.dentry->d_name.name, page,
count);
diff --git a/include/linux/cred.h b/include/linux/cred.h
index f923528d5cc4..b01e309f5686 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -159,6 +159,7 @@ extern const struct cred *get_task_cred(struct task_struct *);
extern struct cred *cred_alloc_blank(void);
extern struct cred *prepare_creds(void);
extern struct cred *prepare_exec_creds(void);
+extern bool is_dumpability_changed(const struct cred *, const struct cred *);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern const struct cred *override_creds(const struct cred *);
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..14df7073a0a8 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -234,9 +234,27 @@ struct signal_struct {
struct mm_struct *oom_mm; /* recorded mm when the thread group got
* killed by the oom killer */
+ struct linux_binprm *exec_bprm; /* Used to check ptrace_may_access
+ * against new credentials while
+ * de_thread is waiting for other
+ * traced threads to terminate.
+ * Set while de_thread is executing.
+ * The cred_guard_mutex is released
+ * after de_thread() has called
+ * zap_other_threads(), therefore
+ * a fatal signal is guaranteed to be
+ * already pending in the unlikely
+ * event, that
+ * current->signal->exec_bprm happens
+ * to be non-zero after the
+ * cred_guard_mutex was acquired.
+ */
+
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
* (notably. ptrace)
+ * Held while execve runs, except when
+ * a sibling thread is being traced.
* Deprecated do not use in new code.
* Use exec_update_lock instead.
*/
diff --git a/kernel/cred.c b/kernel/cred.c
index 98cb4eca23fb..586cb6c7cf6b 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -433,6 +433,28 @@ static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
return false;
}
+/**
+ * is_dumpability_changed - Will changing creds from old to new
+ * affect the dumpability in commit_creds?
+ *
+ * Return: false - dumpability will not be changed in commit_creds.
+ * Return: true - dumpability will be changed to non-dumpable.
+ *
+ * @old: The old credentials
+ * @new: The new credentials
+ */
+bool is_dumpability_changed(const struct cred *old, const struct cred *new)
+{
+ if (!uid_eq(old->euid, new->euid) ||
+ !gid_eq(old->egid, new->egid) ||
+ !uid_eq(old->fsuid, new->fsuid) ||
+ !gid_eq(old->fsgid, new->fsgid) ||
+ !cred_cap_issubset(old, new))
+ return true;
+
+ return false;
+}
+
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
@@ -467,11 +489,7 @@ int commit_creds(struct cred *new)
get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */
- if (!uid_eq(old->euid, new->euid) ||
- !gid_eq(old->egid, new->egid) ||
- !uid_eq(old->fsuid, new->fsuid) ||
- !gid_eq(old->fsgid, new->fsgid) ||
- !cred_cap_issubset(old, new)) {
+ if (is_dumpability_changed(old, new)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 443057bee87c..eb1c450bb7d7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -20,6 +20,7 @@
#include <linux/pagemap.h>
#include <linux/ptrace.h>
#include <linux/security.h>
+#include <linux/binfmts.h>
#include <linux/signal.h>
#include <linux/uio.h>
#include <linux/audit.h>
@@ -435,6 +436,28 @@ static int ptrace_attach(struct task_struct *task, long request,
if (retval)
goto unlock_creds;
+ if (unlikely(task->in_execve)) {
+ struct linux_binprm *bprm = task->signal->exec_bprm;
+ const struct cred __rcu *old_cred;
+ struct mm_struct *old_mm;
+
+ retval = down_write_killable(&task->signal->exec_update_lock);
+ if (retval)
+ goto unlock_creds;
+ task_lock(task);
+ old_cred = task->real_cred;
+ old_mm = task->mm;
+ rcu_assign_pointer(task->real_cred, bprm->cred);
+ task->mm = bprm->mm;
+ retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+ rcu_assign_pointer(task->real_cred, old_cred);
+ task->mm = old_mm;
+ task_unlock(task);
+ up_write(&task->signal->exec_update_lock);
+ if (retval)
+ goto unlock_creds;
+ }
+
write_lock_irq(&tasklist_lock);
retval = -EPERM;
if (unlikely(task->exit_state))
@@ -508,6 +531,14 @@ static int ptrace_traceme(void)
{
int ret = -EPERM;
+ if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
+ return -ERESTARTNOINTR;
+
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ return -ERESTARTNOINTR;
+ }
+
write_lock_irq(&tasklist_lock);
/* Are we already being traced? */
if (!current->ptrace) {
@@ -523,6 +554,7 @@ static int ptrace_traceme(void)
}
}
write_unlock_irq(&tasklist_lock);
+ mutex_unlock(¤t->signal->cred_guard_mutex);
return ret;
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 255999ba9190..b29bbfa0b044 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1955,9 +1955,15 @@ static long seccomp_set_mode_filter(unsigned int flags,
* Make sure we cannot change seccomp or nnp state via TSYNC
* while another thread is in the middle of calling exec.
*/
- if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
- mutex_lock_killable(¤t->signal->cred_guard_mutex))
- goto out_put_fd;
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+ if (mutex_lock_killable(¤t->signal->cred_guard_mutex))
+ goto out_put_fd;
+
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ goto out_put_fd;
+ }
+ }
spin_lock_irq(¤t->sighand->siglock);
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
index 4db327b44586..3b7d81fb99bb 100644
--- a/tools/testing/selftests/ptrace/vmaccess.c
+++ b/tools/testing/selftests/ptrace/vmaccess.c
@@ -39,8 +39,15 @@ TEST(vmaccess)
f = open(mm, O_RDONLY);
ASSERT_GE(f, 0);
close(f);
- f = kill(pid, SIGCONT);
- ASSERT_EQ(f, 0);
+ f = waitpid(-1, NULL, 0);
+ ASSERT_NE(f, -1);
+ ASSERT_NE(f, 0);
+ ASSERT_NE(f, pid);
+ f = waitpid(-1, NULL, 0);
+ ASSERT_EQ(f, pid);
+ f = waitpid(-1, NULL, 0);
+ ASSERT_EQ(f, -1);
+ ASSERT_EQ(errno, ECHILD);
}
TEST(attach)
@@ -57,22 +64,24 @@ TEST(attach)
sleep(1);
k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
- ASSERT_EQ(errno, EAGAIN);
- ASSERT_EQ(k, -1);
+ ASSERT_EQ(k, 0);
k = waitpid(-1, &s, WNOHANG);
ASSERT_NE(k, -1);
ASSERT_NE(k, 0);
ASSERT_NE(k, pid);
ASSERT_EQ(WIFEXITED(s), 1);
ASSERT_EQ(WEXITSTATUS(s), 0);
- sleep(1);
- k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
+ k = waitpid(-1, &s, 0);
+ ASSERT_EQ(k, pid);
+ ASSERT_EQ(WIFSTOPPED(s), 1);
+ ASSERT_EQ(WSTOPSIG(s), SIGTRAP);
+ k = ptrace(PTRACE_CONT, pid, 0L, 0L);
ASSERT_EQ(k, 0);
k = waitpid(-1, &s, 0);
ASSERT_EQ(k, pid);
ASSERT_EQ(WIFSTOPPED(s), 1);
ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
- k = ptrace(PTRACE_DETACH, pid, 0L, 0L);
+ k = ptrace(PTRACE_CONT, pid, 0L, 0L);
ASSERT_EQ(k, 0);
k = waitpid(-1, &s, 0);
ASSERT_EQ(k, pid);
--
2.39.2
nolibc currently uses 32-bit types for various APIs. These are
problematic as their reduced value range can lead to truncated values.
Signed-off-by: Thomas Weißschuh <linux(a)weissschuh.net>
---
Thomas Weißschuh (12):
tools/nolibc: use 64-bit ino_t
tools/nolibc: handle 64-bit off_t for llseek
tools/nolibc: prefer the llseek syscall
tools/nolibc: use 64-bit off_t
tools/nolibc: remove now superfluous overflow check in llseek
tools/nolibc: remove more __nolibc_enosys() fallbacks
tools/nolibc: prefer explicit 64-bit time-related system calls
tools/nolibc: gettimeofday(): avoid libgcc 64-bit divisions
tools/nolibc: use a custom struct timespec
tools/nolibc: always use 64-bit time types
selftests/nolibc: test compatibility of timespec and __kernel_timespec
tools/nolibc: remove time conversions
tools/include/nolibc/arch-s390.h | 3 +
tools/include/nolibc/poll.h | 12 ++--
tools/include/nolibc/std.h | 6 +-
tools/include/nolibc/sys.h | 21 +++---
tools/include/nolibc/sys/time.h | 2 +-
tools/include/nolibc/sys/timerfd.h | 20 +-----
tools/include/nolibc/time.h | 96 ++++++----------------------
tools/include/nolibc/types.h | 9 ++-
tools/testing/selftests/nolibc/nolibc-test.c | 18 ++++++
9 files changed, 68 insertions(+), 119 deletions(-)
---
base-commit: 90ee85c0e1e4b5804ceebbd731653e10ef3849a6
change-id: 20251001-nolibc-uapi-types-1c072d10fcc7
Best regards,
--
Thomas Weißschuh <linux(a)weissschuh.net>
Changelog:
v9:
Added review-bys and addressed comments from Mike Rapoport and
Pratyush Yadav.
Dropped patch that moves abort/finalize to public header per Mike's
request.
Added patch from Zhu Yanjun to output errors by name.
This series appliyes against akpm's mm-unstable branch.
This series refactors the KHO framework to better support in-kernel
users like the upcoming LUO. The current design, which relies on a
notifier chain and debugfs for control, is too restrictive for direct
programmatic use.
The core of this rework is the removal of the notifier chain in favor of
a direct registration API. This decouples clients from the shutdown-time
finalization sequence, allowing them to manage their preserved state
more flexibly and at any time.
In support of this new model, this series also:
- Makes the debugfs interface optional.
- Introduces APIs to unpreserve memory and fixes a bug in the abort
path where client state was being incorrectly discarded. Note that
this is an interim step, as a more comprehensive fix is planned as
part of the stateless KHO work [1].
- Moves all KHO code into a new kernel/liveupdate/ directory to
consolidate live update components.
[1] https://lore.kernel.org/all/20251020100306.2709352-1-jasonmiu@google.com
Mike Rapoport (Microsoft) (1):
kho: drop notifiers
Pasha Tatashin (7):
kho: make debugfs interface optional
kho: add interfaces to unpreserve folios, page ranges, and vmalloc
memblock: Unpreserve memory in case of error
test_kho: Unpreserve memory in case of error
kho: don't unpreserve memory during abort
liveupdate: kho: move to kernel/liveupdate
MAINTAINERS: update KHO maintainers
Zhu Yanjun (1):
liveupdate: kho: Use %pe format specifier for error pointer printing
Documentation/core-api/kho/concepts.rst | 2 +-
MAINTAINERS | 4 +-
include/linux/kexec_handover.h | 46 +-
init/Kconfig | 2 +
kernel/Kconfig.kexec | 24 -
kernel/Makefile | 3 +-
kernel/kexec_handover_internal.h | 16 -
kernel/liveupdate/Kconfig | 39 ++
kernel/liveupdate/Makefile | 5 +
kernel/{ => liveupdate}/kexec_handover.c | 532 +++++++-----------
.../{ => liveupdate}/kexec_handover_debug.c | 0
kernel/liveupdate/kexec_handover_debugfs.c | 221 ++++++++
kernel/liveupdate/kexec_handover_internal.h | 56 ++
lib/test_kho.c | 128 +++--
mm/memblock.c | 93 +--
tools/testing/selftests/kho/vmtest.sh | 1 +
16 files changed, 690 insertions(+), 482 deletions(-)
delete mode 100644 kernel/kexec_handover_internal.h
create mode 100644 kernel/liveupdate/Kconfig
create mode 100644 kernel/liveupdate/Makefile
rename kernel/{ => liveupdate}/kexec_handover.c (80%)
rename kernel/{ => liveupdate}/kexec_handover_debug.c (100%)
create mode 100644 kernel/liveupdate/kexec_handover_debugfs.c
create mode 100644 kernel/liveupdate/kexec_handover_internal.h
base-commit: 9ef7b034116354ee75502d1849280a4d2ff98a7c
--
2.51.1.930.gacf6e81ea2-goog
Hi all,
I have tried looking at an issue from the bpftool repository:
https://github.com/libbpf/bpftool/issues/121 and this patch series
tries to add that enhancement.
Summary: Currently when a map creation is successful there is no message
on the terminal, printing IDs on successful creation of maps can help
notify the user and can be used in CI/CD.
The first patch adds the logic for printing and the second patch adds a
simple selftest for the same.
Thank you very much.
V1 --> V2: PATCH 1 updated [Thanks Yonghong for suggesting better way of
error handling with a new label for close(fd); instead of calling
multiple times]
V2 --> V3: Thanks to Quentin.
PATCH1: drop \n in p_err statement
PATCH2: Remove messages in cases of successful ID printing. Also
remove message with a "FAIL:" prefix to make it more consistent.
Regards,
Harshit
Harshit Mogalapalli (2):
bpftool: Print map ID upon creation and support JSON output
selftests/bpf: Add test for bpftool map ID printing
tools/bpf/bpftool/map.c | 21 +++++++++---
.../testing/selftests/bpf/test_bpftool_map.sh | 32 +++++++++++++++++++
2 files changed, 49 insertions(+), 4 deletions(-)
--
2.50.1
Hi Linux-kselftest,
Please provide a quote for your products:
Include:
1.Pricing (per unit)
2.Delivery cost & timeline
3.Quote expiry date
Deadline: October
Thanks!
Danny Peddinti
PathnSitu Trading
Hi all,
I have tried looking at an issue from the bpftool repository:
https://github.com/libbpf/bpftool/issues/121 and this RFC tries to add
that enhancement.
Summary: Currently when a map creation is successful there is no message
on the terminal, printing IDs on successful creation of maps can help
notify the user and can be used in CI/CD.
The first patch adds the logic for printing and the second patch adds a
simple selftest for the same.
The github issue is not fully solved with these two patches, as there
are other bpf objects that might need similar additions. Would
appreciate any inputs on this.
Thank you very much.
V1 --> V2: PATCH 1 updated [Thanks Yonghong for suggesting better way of
error handling with a new label for close(fd); instead of calling
multiple times]
Regards,
Harshit
Harshit Mogalapalli (2):
bpftool: Print map ID upon creation and support JSON output
selftests/bpf: Add test for bpftool map ID printing
tools/bpf/bpftool/map.c | 21 ++++++++---
.../testing/selftests/bpf/test_bpftool_map.sh | 36 +++++++++++++++++++
2 files changed, 53 insertions(+), 4 deletions(-)
--
2.50.1
DAMON kunit tests were initially written assuming those will be run on
environments that are well controlled and therefore tolerant to
transient test failures and bugs in the test code itself. The user-mode
linux based manual run of the tests is one example of such an
environment. And the test code was written for adding more test
coverage as fast as possible, over making those safe and reliable.
As a result, the tests resulted in having a number of bugs including
real memory leaks, theoretical unhandled memory allocation failures, and
unused memory allocations. The allocation failures that are not handled
well are unlikely in the real world, since those allocations are too
small to fail. But in theory, it can happen and cause inappropriate
memory access.
It is arguable if bugs in test code can really harm users. But, anyway
bugs are bugs that need to be fixed. Fix the bugs one by one. Also Cc
stable@ for the fixes of memory leak and unhandled memory allocation
failures. The unused memory allocations are only a matter of memory
efficiency, so not Cc-ing stable@.
The first patch fixes memory leaks in the test code for the DAMON core
layer.
Following fifteen, three, and one patches respectively fix unhandled
memory allocation failures in the test code for DAMON core layer,
virtual address space DAMON operation set, and DAMON sysfs interface,
one by one per test function.
Final two patches remove memory allocations that are correctly
deallocated at the end, but not really being used by any code.
SeongJae Park (22):
mm/damon/tests/core-kunit: fix memory leak in
damon_test_set_filters_default_reject()
mm/damon/tests/core-kunit: handle allocation failures in
damon_test_regions()
mm/damon/tests/core-kunit: handle memory failure from
damon_test_target()
mm/damon/tests/core-kunit: handle memory alloc failure from
damon_test_aggregate()
mm/damon/tests/core-kunit: handle alloc failures on
damon_test_split_at()
mm/damon/tests/core-kunit: handle alloc failures on
damon_test_merge_two()
mm/damon/tests/core-kunit: handle alloc failures on
dasmon_test_merge_regions_of()
mm/damon/tests/core-kunit: handle alloc failures on
damon_test_split_regions_of()
mm/damon/tests/core-kunit: handle alloc failures in
damon_test_ops_registration()
mm/damon/tests/core-kunit: handle alloc failures in
damon_test_set_regions()
mm/damon/tests/core-kunit: handle alloc failures in
damon_test_update_monitoring_result()
mm/damon/tests/core-kunit: handle alloc failure on
damon_test_set_attrs()
mm/damon/tests/core-kunit: handle alloc failres in
damon_test_new_filter()
mm/damon/tests/core-kunit: handle alloc failure on
damos_test_commit_filter()
mm/damon/tests/core-kunit: handle alloc failures on
damos_test_filter_out()
mm/damon/tests/core-kunit: handle alloc failures on
damon_test_set_filters_default_reject()
mm/damon/tests/vaddr-kunit: handle alloc failures on
damon_do_test_apply_three_regions()
mm/damon/tests/vaddr-kunit: handle alloc failures in
damon_test_split_evenly_fail()
mm/damon/tests/vaddr-kunit: handle alloc failures on
damon_test_split_evenly_succ()
mm/damon/tests/sysfs-kunit: handle alloc failures on
damon_sysfs_test_add_targets()
mm/damon/tests/core-kunit: remove unnecessary damon_ctx variable on
damon_test_split_at()
mm/damon/tests/core-kunit: remove unused ctx in
damon_test_split_regions_of()
mm/damon/tests/core-kunit.h | 125 ++++++++++++++++++++++++++++++++---
mm/damon/tests/sysfs-kunit.h | 25 +++++++
mm/damon/tests/vaddr-kunit.h | 26 +++++++-
3 files changed, 163 insertions(+), 13 deletions(-)
base-commit: 75f0c76bb8c01fdea838a601dc3326b11177c0d8
--
2.47.3
Here is a small optimisation for the in-kernel PM, joined by a small
behavioural change to avoid confusions, and followed by a few more
tests.
- Patch 1: record fullmesh endpoints numbers, not to iterate over all
endpoints to check if one is marked as fullmesh.
- Patch 2: when at least one endpoint is marked as fullmesh, only use
these endpoints when reacting to an ADD_ADDR, even if there are no
endpoints for this IP family: this is less confusing.
- Patch 3: reduce duplicated code to prepare the next patch.
- Patch 4: extra "bind" cases: the listen socket restrict the bind to
one IP address, not allowing MP_JOIN to extra IP addresses, except if
another listening socket accepts them.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Matthieu Baerts (NGI0) (4):
mptcp: pm: in-kernel: record fullmesh endp nb
mptcp: pm: in kernel: only use fullmesh endp if any
selftests: mptcp: join: do_transfer: reduce code dup
selftests: mptcp: join: validate extra bind cases
include/uapi/linux/mptcp.h | 3 +-
net/mptcp/pm_kernel.c | 36 ++++-
net/mptcp/protocol.h | 1 +
net/mptcp/sockopt.c | 2 +
tools/testing/selftests/net/mptcp/mptcp_connect.c | 10 +-
tools/testing/selftests/net/mptcp/mptcp_join.sh | 187 +++++++++++++++++++---
6 files changed, 213 insertions(+), 26 deletions(-)
---
base-commit: 01cc760632b875c4ad0d8fec0b0c01896b8a36d4
change-id: 20251101-net-next-mptcp-fm-endp-nb-bind-cf7ab688d9f1
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
From: Patrick Roy <roypat(a)amazon.co.uk>
[ based on kvm/next ]
Unmapping virtual machine guest memory from the host kernel's direct map is a
successful mitigation against Spectre-style transient execution issues: If the
kernel page tables do not contain entries pointing to guest memory, then any
attempted speculative read through the direct map will necessarily be blocked
by the MMU before any observable microarchitectural side-effects happen. This
means that Spectre-gadgets and similar cannot be used to target virtual machine
memory. Roughly 60% of speculative execution issues fall into this category [1,
Table 1].
This patch series extends guest_memfd with the ability to remove its memory
from the host kernel's direct map, to be able to attain the above protection
for KVM guests running inside guest_memfd.
Additionally, a Firecracker branch with support for these VMs can be found on
GitHub [2].
For more details, please refer to the v5 cover letter [v5]. No
substantial changes in design have taken place since.
=== Changes Since v6 ===
- Drop patch for passing struct address_space to ->free_folio(), due to
possible races with freeing of the address_space. (Hugh)
- Stop using PG_uptodate / gmem preparedness tracking to keep track of
direct map state. Instead, use the lowest bit of folio->private. (Mike, David)
- Do direct map removal when establishing mapping of gmem folio instead
of at allocation time, due to impossibility of handling direct map
removal errors in kvm_gmem_populate(). (Patrick)
- Do TLB flushes after direct map removal, and provide a module
parameter to opt out from them, and a new patch to export
flush_tlb_kernel_range() to KVM. (Will)
[1]: https://download.vusec.net/papers/quarantine_raid23.pdf
[2]: https://github.com/firecracker-microvm/firecracker/tree/feature/secret-hidi…
[RFCv1]: https://lore.kernel.org/kvm/20240709132041.3625501-1-roypat@amazon.co.uk/
[RFCv2]: https://lore.kernel.org/kvm/20240910163038.1298452-1-roypat@amazon.co.uk/
[RFCv3]: https://lore.kernel.org/kvm/20241030134912.515725-1-roypat@amazon.co.uk/
[v4]: https://lore.kernel.org/kvm/20250221160728.1584559-1-roypat@amazon.co.uk/
[v5]: https://lore.kernel.org/kvm/20250828093902.2719-1-roypat@amazon.co.uk/
[v6]: https://lore.kernel.org/kvm/20250912091708.17502-1-roypat@amazon.co.uk/
Patrick Roy (12):
arch: export set_direct_map_valid_noflush to KVM module
x86/tlb: export flush_tlb_kernel_range to KVM module
mm: introduce AS_NO_DIRECT_MAP
KVM: guest_memfd: Add stub for kvm_arch_gmem_invalidate
KVM: guest_memfd: Add flag to remove from direct map
KVM: guest_memfd: add module param for disabling TLB flushing
KVM: selftests: load elf via bounce buffer
KVM: selftests: set KVM_MEM_GUEST_MEMFD in vm_mem_add() if guest_memfd
!= -1
KVM: selftests: Add guest_memfd based vm_mem_backing_src_types
KVM: selftests: cover GUEST_MEMFD_FLAG_NO_DIRECT_MAP in existing
selftests
KVM: selftests: stuff vm_mem_backing_src_type into vm_shape
KVM: selftests: Test guest execution from direct map removed gmem
Documentation/virt/kvm/api.rst | 5 ++
arch/arm64/include/asm/kvm_host.h | 12 ++++
arch/arm64/mm/pageattr.c | 1 +
arch/loongarch/mm/pageattr.c | 1 +
arch/riscv/mm/pageattr.c | 1 +
arch/s390/mm/pageattr.c | 1 +
arch/x86/include/asm/tlbflush.h | 3 +-
arch/x86/mm/pat/set_memory.c | 1 +
arch/x86/mm/tlb.c | 1 +
include/linux/kvm_host.h | 9 +++
include/linux/pagemap.h | 16 +++++
include/linux/secretmem.h | 18 -----
include/uapi/linux/kvm.h | 2 +
lib/buildid.c | 4 +-
mm/gup.c | 19 ++----
mm/mlock.c | 2 +-
mm/secretmem.c | 8 +--
.../testing/selftests/kvm/guest_memfd_test.c | 2 +
.../testing/selftests/kvm/include/kvm_util.h | 37 ++++++++---
.../testing/selftests/kvm/include/test_util.h | 8 +++
tools/testing/selftests/kvm/lib/elf.c | 8 +--
tools/testing/selftests/kvm/lib/io.c | 23 +++++++
tools/testing/selftests/kvm/lib/kvm_util.c | 61 +++++++++--------
tools/testing/selftests/kvm/lib/test_util.c | 8 +++
tools/testing/selftests/kvm/lib/x86/sev.c | 1 +
.../selftests/kvm/pre_fault_memory_test.c | 1 +
.../selftests/kvm/set_memory_region_test.c | 50 ++++++++++++--
.../kvm/x86/private_mem_conversions_test.c | 7 +-
virt/kvm/guest_memfd.c | 66 +++++++++++++++++--
virt/kvm/kvm_main.c | 8 +++
30 files changed, 290 insertions(+), 94 deletions(-)
base-commit: a6ad54137af92535cfe32e19e5f3bc1bb7dbd383
--
2.51.0
The script "ethtool-common.sh" is not installed in INSTALL_PATH, and
triggers some errors when I try to run the test
'drivers/net/netdevsim/ethtool-coalesce.sh':
TAP version 13
1..1
# timeout set to 600
# selftests: drivers/net/netdevsim: ethtool-coalesce.sh
# ./ethtool-coalesce.sh: line 4: ethtool-common.sh: No such file or directory
# ./ethtool-coalesce.sh: line 25: make_netdev: command not found
# ethtool: bad command line argument(s)
# ./ethtool-coalesce.sh: line 124: check: command not found
# ./ethtool-coalesce.sh: line 126: [: -eq: unary operator expected
# FAILED /0 checks
not ok 1 selftests: drivers/net/netdevsim: ethtool-coalesce.sh # exit=1
Install this file to avoid this error. After this patch:
TAP version 13
1..1
# timeout set to 600
# selftests: drivers/net/netdevsim: ethtool-coalesce.sh
# PASSED all 22 checks
ok 1 selftests: drivers/net/netdevsim: ethtool-coalesce.sh
Fixes: fbb8531e58bd ("selftests: extract common functions in ethtool-common.sh")
Signed-off-by: Wang Liang <wangliang74(a)huawei.com>
---
tools/testing/selftests/drivers/net/netdevsim/Makefile | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile
index daf51113c827..df10c7243511 100644
--- a/tools/testing/selftests/drivers/net/netdevsim/Makefile
+++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile
@@ -20,4 +20,8 @@ TEST_PROGS := \
udp_tunnel_nic.sh \
# end of TEST_PROGS
+TEST_FILES := \
+ ethtool-common.sh
+# end of TEST_FILES
+
include ../../../lib.mk
--
2.34.1
The GRO self-test, gro.c, currently constructs IPv6 packets containing a
Hop-by-Hop Options header (IPPROTO_HOPOPTS) to ensure the GRO path
correctly handles IPv6 extension headers.
However, network elements may be configured to drop packets with the
Hop-by-Hop Options header (HBH). This causes the self-test to fail
in environments where such network elements are present.
To improve the robustness and reliability of this test in diverse
network environments, switch from using IPPROTO_HOPOPTS to
IPPROTO_DSTOPTS (Destination Options).
The Destination Options header is less likely to be dropped by
intermediate routers and still serves the core purpose of the test:
validating GRO's handling of an IPv6 extension header. This change
ensures the test can execute successfully without being incorrectly
failed by network policies outside the kernel's control.
Fixes: 7d1575014a63 ("selftests/net: GRO coalesce test")
Reviewed-by: Willem de Bruijn <willemb(a)google.com>
Signed-off-by: Anubhav Singh <anubhavsinggh(a)google.com>
---
tools/testing/selftests/net/gro.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
index 2b1d9f2b3e9e..d8c29fe39c1d 100644
--- a/tools/testing/selftests/net/gro.c
+++ b/tools/testing/selftests/net/gro.c
@@ -754,11 +754,11 @@ static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1,
static char exthdr_pck[sizeof(buf) + MIN_EXTHDR_SIZE];
create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
- add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS, ext_data1);
+ add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_DSTOPTS, ext_data1);
write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr);
create_packet(buf, PAYLOAD_LEN * 1, 0, PAYLOAD_LEN, 0);
- add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS, ext_data2);
+ add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_DSTOPTS, ext_data2);
write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr);
}
--
2.51.1.851.g4ebd6896fd-goog
Due to the gro_sender sending data packets and FIN packets
in very quick succession, these are received almost simultaneously
by the gro_receiver. FIN packets are sometimes processed before the
data packets leading to intermittent (~1/100) test failures.
This change adds a delay of 100ms before sending FIN packets
in gro:tcp test to avoid the out-of-order delivery. The same
mitigation already exists for the gro:ip test.
Fixes: 7d1575014a63 ("selftests/net: GRO coalesce test")
Reviewed-by: Willem de Bruijn <willemb(a)google.com>
Signed-off-by: Anubhav Singh <anubhavsinggh(a)google.com>
---
tools/testing/selftests/net/gro.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
index 2b1d9f2b3e9e..3fa63bd85dea 100644
--- a/tools/testing/selftests/net/gro.c
+++ b/tools/testing/selftests/net/gro.c
@@ -989,6 +989,7 @@ static void check_recv_pkts(int fd, int *correct_payload,
static void gro_sender(void)
{
+ const int fin_delay_us = 100 * 1000;
static char fin_pkt[MAX_HDR_LEN];
struct sockaddr_ll daddr = {};
int txfd = -1;
@@ -1032,15 +1033,22 @@ static void gro_sender(void)
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
} else if (strcmp(testname, "tcp") == 0) {
send_changed_checksum(txfd, &daddr);
+ /* Adding sleep before sending FIN so that it is not
+ * received prior to other packets.
+ */
+ usleep(fin_delay_us);
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
send_changed_seq(txfd, &daddr);
+ usleep(fin_delay_us);
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
send_changed_ts(txfd, &daddr);
+ usleep(fin_delay_us);
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
send_diff_opt(txfd, &daddr);
+ usleep(fin_delay_us);
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
} else if (strcmp(testname, "ip") == 0) {
send_changed_ECN(txfd, &daddr);
--
2.51.1.851.g4ebd6896fd-goog
Currently, guard regions are not visible to users except through
/proc/$pid/pagemap, with no explicit visibility at the VMA level.
This makes the feature less useful, as it isn't entirely apparent which
VMAs may have these entries present, especially when performing actions
which walk through memory regions such as those performed by CRIU.
This series addresses this issue by introducing the VM_MAYBE_GUARD flag
which fulfils this role, updating the smaps logic to display an entry for
these.
The semantics of this flag are that a guard region MAY be present if set
(we cannot be sure, as we can't efficiently track whether an
MADV_GUARD_REMOVE finally removes all the guard regions in a VMA) - but if
not set the VMA definitely does NOT have any guard regions present.
It's problematic to establish this flag without further action, because
that means that VMAs with guard regions in them become non-mergeable with
adjacent VMAs for no especially good reason.
To work around this, this series also introduces the concept of 'sticky'
VMA flags - that is flags which:
a. if set in one VMA and not in another still permit those VMAs to be
merged (if otherwise compatible).
b. When they are merged, the resultant VMA must have the flag set.
The VMA logic is updated to propagate these flags correctly.
Additionally, VM_MAYBE_GUARD being an explicit VMA flag allows us to solve
an issue with file-backed guard regions - previously these established an
anon_vma object for file-backed mappings solely to have vma_needs_copy()
correctly propagate guard region mappings to child processes.
We introduce a new flag alias VM_COPY_ON_FORK (which currently only
specifies VM_MAYBE_GUARD) and update vma_needs_copy() to check explicitly
for this flag and to copy page tables if it is present, which resolves this
issue.
Finally we introduce extensive VMA userland tests to assert that the sticky
VMA logic behaves correctly as well as guard region self tests to assert
that smaps visibility is correctly implemented.
Lorenzo Stoakes (3):
mm: introduce VM_MAYBE_GUARD and make visible for guard regions
mm: implement sticky, copy on fork VMA flags
selftests/mm/guard-regions: add smaps visibility test
Documentation/filesystems/proc.rst | 1 +
fs/proc/task_mmu.c | 1 +
include/linux/mm.h | 33 ++++++
include/trace/events/mmflags.h | 1 +
mm/madvise.c | 22 ++--
mm/memory.c | 3 +
mm/vma.c | 22 ++--
tools/testing/selftests/mm/guard-regions.c | 120 +++++++++++++++++++++
tools/testing/selftests/mm/vm_util.c | 5 +
tools/testing/selftests/mm/vm_util.h | 1 +
tools/testing/vma/vma.c | 89 +++++++++++++--
tools/testing/vma/vma_internal.h | 33 ++++++
12 files changed, 303 insertions(+), 28 deletions(-)
--
2.51.0
Hello,
This series adds the live update support in the VFIO PCI subsystem on top of
Live Update Orchestrator (LUO) [1].
This series can also be found on GitHub:
https://github.com/shvipin/linux vfio/liveupdate/rfc-v1
Goal of live update in VFIO subsystem is to preserve VFIO PCI devices
while the host kernel is going through a live update. A preserved device
means it can continue to work, perform DMA, not get reset while host
under live update gets rebooted via kexec.
This series registers VFIO with LUO, implements LUO callbacks, skip DMA
clear, skip device reset, preserves and restores a device virtual
config during live update. I have added a selftest towards the end of
this series, vfio_pci_liveupdate_test, which sets certain properties of
a VFIO PCI device, performs a live update, and then validates those
properties are still same on the device.
Overall flow for a VFIO device going through a live update will be
something like:
1. Userspace passes a VFIO cdev FD along with a token to LUO for preservation.
2. LUO passes FD to VFIO subsystem to verify if FD can be preserved. If
yes, it increases the refcount on the FD.
3. Eventually, userspace tells LUO to prepare for live update which
results in LUO calling prepare() callback to each of its register filesystem
handler with the passed FD it should be preparing.
4. VFIO subsystem saves certain properties which will be either lost or
hard to recover from the device.
5. VFIO saves the needed data to KHO and provide LUO with the
physical address of the data preserved by KHO.
6. Userspace sends FREEZE event to freeze the system. LUO forwards this
to each of its registered subsystem.
7. VFIO disables interrupts configured on the device during freeze call.
8. Userspace performs kexec.
9. During kexec reboot, generally, all PCI devices gets their Bus Master
Enable bit disabled. In live update case, preserved VFIO devices are
skipped.
9. During boot, usual device enumeration happens and LUO also intializes
itself.
10. Userspace uses the same token value (step 1), and ask LUO to return
VFIO FD corresponding to token.
11. LUO ask VFIO to return VFIO cdev FD corresponding to the token. It
gives it the physical address which VFIO returned it in step 5.
12. VFIO restore the KHO data and read the BDF value it saved. It
iterates through all of the VFIO device it has in its VFIO cdev
class and finds the BDF device.
13. VFIO creates an anonymous inode and file corresponding to the VFIO
PCI device and returns it to LUO and LUO returns it to userspace.
14. Now FD returned to userspace works exactly same as if userspace has
opened a VFIO device from /dev/vfio/device/* location.
15. It makes usual bind iommufd and attach page table calls.
16. During bind, when VFIO device is internally opened for the first
time:
- VFIO skips Bus Master Disable
- VFIO skips device reset.
- VFIO instead of initializing vconfig from the scratch uses the
vconfig stored in KHO, and same for few other fields.
This is what current series is implementing and validating through
selftest.
There are other things are which not implemented yet and some are also
dependent on other subsystems. For example:
1. Once a device has been prepared, VFIO should not allow any changes to
its state from userspace for example, changing PCI config values,
resetting the device, etc.
2. Device IOVA is not preserved in this series. This work is done
separately in IOMMMUFD live update preservation [2]
3. During PCI device enumeration, PCI subsystem writes to PCI config
space, attach device to its original driver if present. This work is
being done in PCI preservation [3].
4. Enabling PCI device done in VFIO subsystem should be handled in PCI
subsystem. Current, this patch series hasn't changed the behavior.
5. If live update gets canceled, interrupts which are disabled in
freeze need to be reconfigured again.
6. In finish, if a device is not restored, how to know if KHO folio has been
restored or not.
6. VFIO cdev is restored in anonymous file system. This should instead
be done on devetmpfs
For reviewers, following are the grouping of patches in this series:
Patches 1-4
-----------
Feel free to ignore if you are only interested in VFIO.
These are only for live update selftests. I had to make some changes
on top LUO v4 series, to create a library out of them which can be
used in other selftests (vfio), and fix some build issues.
Patches 5-9
-----------
Adds basic live update support in VFIO.
Registers to LUO, saves the device BDF in KHO during prepare, and
returns VFIO cdev FD during restore.
It doesn't save or skip anything else.
Patches 10-17
-------------
Adds support for skipping certain opertions and preserving certain
data needed to restore a device.
Patches 18-21
-------------
- Integrate VFIO selftest with live update selftest library.
- Adds a basic vfio_pci_liveupdate_test test which validates that Bus
Master Enable bit is preserved, and virtual config is restored
properly.
Testing
-------
I have done testing on QEMU with a test pci device and also on a bare
metal with Intel DSA device. Make sure IDXD driver is not built in your
kernel if testing with Intel DSA device. Basically, whichever device you
use, it should not get auto-bind to any other driver.
Important config options which should be enabled to test this series:
- CONFIG_KEXEC_FILE
- CONFIG_LIVEUPDATE
- CONFIG_KEXEC_HANDOVER
Besides this usual VFIO, VFIO_PCI, IOMMU and other dependencies are
enabled.
To build the test provide KHDR_INCLUDES to your make command if your
headers are out-of-tree.
KHDR_INCLUDES="-isystem ../../../../build/usr/include" make
vfio_pci_liveupdate_test needs to be executed manually. This test needs
to be executed two times; one before the live update and second after.
./run.sh -d 0000:00:04.0 vfio_pci_liveupdate_test
Next Steps
----------
1. Looking forward to feedback on this series.
- What other things we should save?
- Which things should not be saved?
- Any locks or incorrect locking done in the series.
- Any optimizations.
2. Integration with IOMMUFD and PCI series for complete workflow where a
device continues a DMA while undergoing through live update.
I will be going on a paternity leave soon, so, my responses gonna be
intermittent. David Matlack (dmatlack(a)google.com) has graciously offered
to work on this series and continue upstream engagement on this feature
until I am back. Thank you, David!
[1] LUO-v4: https://lore.kernel.org/linux-mm/20250929010321.3462457-1-pasha.tatashin@so…
[2] IOMMUFD: https://lore.kernel.org/linux-iommu/20250928190624.3735830-1-skhawaja@googl…
[3] PCI: https://lore.kernel.org/linux-pci/20250916-luo-pci-v2-0-c494053c3c08@kernel…
Vipin Sharma (21):
selftests/liveupdate: Build tests from the selftests/liveupdate
directory
selftests/liveupdate: Create library of core live update ioctls
selftests/liveupdate: Move do_kexec.sh script to liveupdate/lib
selftests/liveupdate: Move LUO ioctls calls to liveupdate library
vfio/pci: Register VFIO live update file handler to Live Update
Orchestrator
vfio/pci: Accept live update preservation request for VFIO cdev
vfio/pci: Store VFIO PCI device preservation data in KHO for live
update
vfio/pci: Retrieve preserved VFIO device for Live Update Orechestrator
vfio/pci: Add Live Update finish callback implementation
PCI: Add option to skip Bus Master Enable reset during kexec
vfio/pci: Skip clearing bus master on live update device during kexec
vfio/pci: Skip clearing bus master on live update restored device
vfio/pci: Preserve VFIO PCI config space through live update
vfio/pci: Skip device reset on live update restored device.
PCI: Make PCI saved state and capability structs public
vfio/pci: Save and restore the PCI state of the VFIO device
vfio/pci: Disable interrupts before going live update kexec
vfio: selftests: Build liveupdate library in VFIO selftests
vfio: selftests: Initialize vfio_pci_device using a VFIO cdev FD
vfio: selftests: Add VFIO live update test
vfio: selftests: Validate vconfig preservation of VFIO PCI device
during live update
drivers/pci/pci-driver.c | 6 +-
drivers/pci/pci.c | 5 -
drivers/pci/pci.h | 7 -
drivers/vfio/pci/Makefile | 1 +
drivers/vfio/pci/vfio_pci_config.c | 17 +
drivers/vfio/pci/vfio_pci_core.c | 31 +-
drivers/vfio/pci/vfio_pci_liveupdate.c | 461 ++++++++++++++++++
drivers/vfio/pci/vfio_pci_priv.h | 17 +
drivers/vfio/vfio_main.c | 20 +-
include/linux/pci.h | 15 +
include/linux/vfio.h | 8 +
include/linux/vfio_pci_core.h | 1 +
tools/testing/selftests/liveupdate/.gitignore | 7 +-
tools/testing/selftests/liveupdate/Makefile | 31 +-
.../liveupdate/{ => lib}/do_kexec.sh | 0
.../liveupdate/lib/include/liveupdate_util.h | 27 +
.../selftests/liveupdate/lib/libliveupdate.mk | 18 +
.../liveupdate/lib/liveupdate_util.c | 106 ++++
.../selftests/liveupdate/luo_multi_file.c | 2 -
.../selftests/liveupdate/luo_multi_kexec.c | 2 -
.../selftests/liveupdate/luo_multi_session.c | 2 -
.../selftests/liveupdate/luo_test_utils.c | 73 +--
.../selftests/liveupdate/luo_test_utils.h | 10 +-
.../selftests/liveupdate/luo_unreclaimed.c | 1 -
tools/testing/selftests/vfio/Makefile | 15 +-
.../selftests/vfio/lib/include/vfio_util.h | 1 +
.../selftests/vfio/lib/vfio_pci_device.c | 33 +-
.../selftests/vfio/vfio_pci_liveupdate_test.c | 116 +++++
28 files changed, 900 insertions(+), 133 deletions(-)
create mode 100644 drivers/vfio/pci/vfio_pci_liveupdate.c
rename tools/testing/selftests/liveupdate/{ => lib}/do_kexec.sh (100%)
create mode 100644 tools/testing/selftests/liveupdate/lib/include/liveupdate_util.h
create mode 100644 tools/testing/selftests/liveupdate/lib/libliveupdate.mk
create mode 100644 tools/testing/selftests/liveupdate/lib/liveupdate_util.c
create mode 100644 tools/testing/selftests/vfio/vfio_pci_liveupdate_test.c
base-commit: e48be01cadc981362646dc3a87d57316421590a5
--
2.51.0.858.gf9c4a03a3a-goog
v22: fixing build error due to -march=zicfiss being picked in gcc-13 and above
but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v22:
- CONFIG_RISCV_USER_CFI was by default "n". With dual vdso support it is
default "y" (if toolchain supports it). Fixing build error due to
"-march=zicfiss" being picked in gcc-13 partially. gcc-13 only recognizes the
flag but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
- picked up tags and some cosmetic changes in commit message for dual vdso
patch.
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v22:
- Link to v21: https://lore.kernel.org/r/20251015-v5_user_cfi_series-v21-0-6a07856e90e7@ri…
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 22 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 95 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 27 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 3 +
tools/testing/selftests/riscv/cfi/Makefile | 16 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/riscv_cfi_test.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2475 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
Hello,
this is yet another conversion series, this time tackling the
test_tc_edt.sh. This one was at the bottom of our list due to the fact
that it is based on some bandwith measurement (and so, increasing the
risk to make it flaky in CI), but here is an attempt anyway, as it also
showcases a nice example of BPF-based rate shaping.
The converted test roughly follows the original script logic, with two
veths in two namespaces, a TCP connection between a client and a server,
and the client pushing as much data as possible during a specific
period. We then compute the effective data rate, shaped by the eBPF
program, by reading the RX interface stats, and compare it to the target
rate. The test passes if the measured rate is within a defined error
margin.
There are two knobs driving the robustness of the test in CI:
- the test duration (the higher, the more precise is the effective rate)
- the tolerated error margin
The original test was configured with a 20s duration and a 1% error
margin. The new test is configured with a 2s duration and a 2% error
margin, to:
- make the duration tolerable in CI
- while keeping enough margin for rate measure fluctuations depending on
the CI machines load
This has been run multiple times locally to ensure that those values are
sane, and once in CI before sending the series, but I suggest to let it
live a few days in CI to see how it really behaves.
Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore(a)bootlin.com>
---
Alexis Lothoré (eBPF Foundation) (4):
selftests/bpf: rename test_tc_edt.bpf.c section to expose program type
selftests/bpf: integrate test_tc_edt into test_progs
selftests/bpf: remove test_tc_edt.sh
selftests/bpf: do not hardcode target rate in test_tc_edt BPF program
tools/testing/selftests/bpf/Makefile | 2 -
.../testing/selftests/bpf/prog_tests/test_tc_edt.c | 274 +++++++++++++++++++++
tools/testing/selftests/bpf/progs/test_tc_edt.c | 9 +-
tools/testing/selftests/bpf/test_tc_edt.sh | 100 --------
4 files changed, 279 insertions(+), 106 deletions(-)
---
base-commit: 1e2d874b04ba46a3b9fe6697097aa437641f4339
change-id: 20251030-tc_edt-3ea8e8d3d14e
Best regards,
--
Alexis Lothoré, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
v22: fixing build error due to -march=zicfiss being picked in gcc-13 and above
but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v22:
- CONFIG_RISCV_USER_CFI was by default "n". With dual vdso support it is
default "y" (if toolchain supports it). Fixing build error due to
"-march=zicfiss" being picked in gcc-13 partially. gcc-13 only recognizes the
flag but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
- picked up tags and some cosmetic changes in commit message for dual vdso
patch.
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v22:
- Link to v21: https://lore.kernel.org/r/20251015-v5_user_cfi_series-v21-0-6a07856e90e7@ri…
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 22 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 95 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 27 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 3 +
tools/testing/selftests/riscv/cfi/Makefile | 16 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/riscv_cfi_test.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2475 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
Hi all,
The test_xsk.sh script covers many AF_XDP use cases. The tests it runs
are defined in xksxceiver.c. Since this script is used to test real
hardware, the goal here is to leave it as it is, and only integrate the
tests that run on veth peers into the test_progs framework.
PATCH 1 extracts test_xsk[.c/.h] from xskxceiver[.c/.h] to make the
tests available to test_progs.
PATCH 2 to 7 fix small issues in the current test
PATCH 8 to 13 handle all errors to release resources instead of calling
exit() when any error occurs.
PATCH 14 isolates the tests that won't fit in the CI
PATCH 15 integrates the CI tests to the test_progs framework
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet(a)bootlin.com>
---
Changes in v7:
- Restore 'test_ns' prefix to allow parallel execution.
- PATCH 11: fix potential uninitialized variable spotted by AI.
- PACTH 12: fix potential resource leak spotted by AI
- Link to v6: https://lore.kernel.org/r/20251029-xsk-v6-0-5a63a64dff98@bootlin.com
Changes in v6:
- Setup veth peer once for each mode instead of once for each substest
- Rename the 'flaky' table 'skip-ci' table and move the automatically
skipped and the longest tests into it
- Link to v5: https://lore.kernel.org/r/20251016-xsk-v5-0-662c95eb8005@bootlin.com
Changes in v5:
- Rebase on latest bpf-next_base
- Move XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF to the flaky table
- Add Maciej's reviewed-by
- Link to v4: https://lore.kernel.org/r/20250924-xsk-v4-0-20e57537b876@bootlin.com
Changes in v4:
- Fix test_xsk.sh's summary report.
- Merge PATCH 11 & 12 together, otherwise PATCH 11 fails to build.
- Split old PATCH 3 in two patches. The first one fixes
testapp_stats_rx_dropped(), the second one fixes
testapp_xdp_shared_umem(). The unecessary frees (in
testapp_stats_rx_full() and testapp_stats_fill_empty() are removed)
- Link to v3: https://lore.kernel.org/r/20250904-xsk-v3-0-ce382e331485@bootlin.com
Changes in v3:
- Rebase on latest bpf-next_base to integrate commit c9110e6f7237 ("selftests/bpf:
Fix count write in testapp_xdp_metadata_copy()").
- Move XDP_METADATA_COPY_* tests from flaky-tests to nominal tests
- Link to v2: https://lore.kernel.org/r/20250902-xsk-v2-0-17c6345d5215@bootlin.com
Changes in v2:
- Rebase on the latest bpf-next_base and integrate the newly added tests
to the work (adjust_tail* and tx_queue_consumer tests)
- Re-order patches to split xkxceiver sooner.
- Fix the bug reported by Maciej.
- Fix verbose mode in test_xsk.sh by keeping kselftest (remove PATCH 1,
7 and 8)
- Link to v1: https://lore.kernel.org/r/20250313-xsk-v1-0-7374729a93b9@bootlin.com
---
Bastien Curutchet (eBPF Foundation) (15):
selftests/bpf: test_xsk: Split xskxceiver
selftests/bpf: test_xsk: Initialize bitmap before use
selftests/bpf: test_xsk: Fix __testapp_validate_traffic()'s return value
selftests/bpf: test_xsk: fix memory leak in testapp_stats_rx_dropped()
selftests/bpf: test_xsk: fix memory leak in testapp_xdp_shared_umem()
selftests/bpf: test_xsk: Wrap test clean-up in functions
selftests/bpf: test_xsk: Release resources when swap fails
selftests/bpf: test_xsk: Add return value to init_iface()
selftests/bpf: test_xsk: Don't exit immediately when xsk_attach fails
selftests/bpf: test_xsk: Don't exit immediately when gettimeofday fails
selftests/bpf: test_xsk: Don't exit immediately when workers fail
selftests/bpf: test_xsk: Don't exit immediately if validate_traffic fails
selftests/bpf: test_xsk: Don't exit immediately on allocation failures
selftests/bpf: test_xsk: Isolate non-CI tests
selftests/bpf: test_xsk: Integrate test_xsk.c to test_progs framework
tools/testing/selftests/bpf/Makefile | 11 +-
tools/testing/selftests/bpf/prog_tests/test_xsk.c | 2596 ++++++++++++++++++++
tools/testing/selftests/bpf/prog_tests/test_xsk.h | 298 +++
tools/testing/selftests/bpf/prog_tests/xsk.c | 151 ++
tools/testing/selftests/bpf/xskxceiver.c | 2696 +--------------------
tools/testing/selftests/bpf/xskxceiver.h | 156 --
6 files changed, 3184 insertions(+), 2724 deletions(-)
---
base-commit: 1e2d874b04ba46a3b9fe6697097aa437641f4339
change-id: 20250218-xsk-0cf90e975d14
Best regards,
--
Bastien Curutchet (eBPF Foundation) <bastien.curutchet(a)bootlin.com>
[Kevin has a done a great job to get through reviews on all these, and
Vasant/Ankit have been looking at it on AMD systems, I think we are close to
being done now!]
Currently each of the iommu page table formats duplicates all of the logic
to maintain the page table and perform map/unmap/etc operations. There are
several different versions of the algorithms between all the different
formats. The io-pgtable system provides an interface to help isolate the
page table code from the iommu driver, but doesn't provide tools to
implement the common algorithms.
This makes it very hard to improve the state of the pagetable code under
the iommu domains as any proposed improvement needs to alter a large
number of different driver code paths. Combined with a lack of software
based testing this makes improvement in this area very hard.
iommufd wants several new page table operations:
- More efficient map/unmap operations, using iommufd's batching logic
- unmap that returns the physical addresses into a batch as it progresses
- cut that allows splitting areas so large pages can have holes
poked in them dynamically (ie guestmemfd hitless shared/private
transitions)
- More agressive freeing of table memory to avoid waste
- Fragmenting large pages so that dirty tracking can be more granular
- Reassembling large pages so that VMs can run at full IO performance
in migration/dirty tracking error flows
- KHO integration for kernel live upgrade
Together these are algorithmically complex enough to be a very significant
task to go and implement in all the page table formats we support. Just
the "server" focused drivers use almost all the formats (ARMv8 S1&S2 / x86
PAE / AMDv1 / VT-d SS / RISCV)
Instead of doing the duplicated work, this series takes the first step to
consolidate the algorithms into one places. In spirit it is similar to the
work Christoph did a few years back to pull the redundant get_user_pages()
implementations out of the arch code into core MM. This unlocked a great
deal of improvement in that space in the following years. I would like to
see the same benefit in iommu as well.
My first RFC showed a bigger picture with all most all formats and more
algorithms. This series reorganizes that to be narrowly focused on just
enough to convert the AMD driver to use the new mechanism.
kunit tests are provided that allow good testing of the algorithms and all
formats on x86, nothing is arch specific.
AMD is one of the simpler options as the HW is quite uniform with few
different options/bugs while still requiring the complicated contiguous
pages support. The HW also has a very simple range based invalidation
approach that is easy to implement.
The AMD v1 and AMD v2 page table formats are implemented bit for bit
identical to the current code, tested using a compare kunit test that
checks against the io-pgtable version (on github, see below).
Updating the AMD driver to replace the io-pgtable layer with the new stuff
is fairly straightforward now. The layering is fixed up in the new version
so that all the invalidation goes through function pointers.
Several small fixing patches have come out of this as I've been fixing the
problems that the test suite uncovers in the current code, and
implementing the fixed version in iommupt.
On performance, there is a quite wide variety of implementation designs
across all the drivers. Looking at some key performance across
the main formats:
iommu_map():
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 53,66 , 51,63 , 19.19 (AMDV1)
256*2^12, 386,1909 , 367,1795 , 79.79
256*2^21, 362,1633 , 355,1556 , 77.77
2^12, 56,62 , 52,59 , 11.11 (AMDv2)
256*2^12, 405,1355 , 357,1292 , 72.72
256*2^21, 393,1160 , 358,1114 , 67.67
2^12, 55,65 , 53,62 , 14.14 (VT-d second stage)
256*2^12, 391,518 , 332,512 , 35.35
256*2^21, 383,635 , 336,624 , 46.46
2^12, 57,65 , 55,63 , 12.12 (ARM 64 bit)
256*2^12, 380,389 , 361,369 , 2.02
256*2^21, 358,419 , 345,400 , 13.13
iommu_unmap():
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 69,88 , 65,85 , 23.23 (AMDv1)
256*2^12, 353,6498 , 331,6029 , 94.94
256*2^21, 373,6014 , 360,5706 , 93.93
2^12, 71,72 , 66,69 , 4.04 (AMDv2)
256*2^12, 228,891 , 206,871 , 76.76
256*2^21, 254,721 , 245,711 , 65.65
2^12, 69,87 , 65,82 , 20.20 (VT-d second stage)
256*2^12, 210,321 , 200,315 , 36.36
256*2^21, 255,349 , 238,342 , 30.30
2^12, 72,77 , 68,74 , 8.08 (ARM 64 bit)
256*2^12, 521,357 , 447,346 , -29.29
256*2^21, 489,358 , 433,345 , -25.25
* Above numbers include additional patches to remove the iommu_pgsize()
overheads. gcc 13.3.0, i7-12700
This version provides fairly consistent performance across formats. ARM
unmap performance is quite different because this version supports
contiguous pages and uses a very different algorithm for unmapping. Though
why it is so worse compared to AMDv1 I haven't figured out yet.
The per-format commits include a more detailed chart.
There is a second branch:
https://github.com/jgunthorpe/linux/commits/iommu_pt_all
Containing supporting work and future steps:
- ARM short descriptor (32 bit), ARM long descriptor (64 bit) formats
- RISCV format and RISCV conversion
https://github.com/jgunthorpe/linux/commits/iommu_pt_riscv
- Support for a DMA incoherent HW page table walker
- VT-d second stage format and VT-d conversion
https://github.com/jgunthorpe/linux/commits/iommu_pt_vtd
- DART v1 & v2 format
- Draft of a iommufd 'cut' operation to break down huge pages
- A compare test that checks the iommupt formats against the iopgtable
interface, including updating AMD to have a working iopgtable and patches
to make VT-d have an iopgtable for testing.
- A performance test to micro-benchmark map and unmap against iogptable
My strategy is to go one by one for the drivers:
- AMD driver conversion
- RISCV page table and driver
- Intel VT-d driver and VTDSS page table
- Flushing improvements for RISCV
- ARM SMMUv3
And concurrently work on the algorithm side:
- debugfs content dump, like VT-d has
- Cut support
- Increase/Decrease page size support
- map/unmap batching
- KHO
As we make more algorithm improvements the value to convert the drivers
increases.
This is on github: https://github.com/jgunthorpe/linux/commits/iommu_pt
v7:
- Rebase to v6.18-rc2
- Improve comments and documentation
- Add a few missed __sme_sets() for AMD CC
- Rename pt_iommu_flush_ops -> pt_iommu_driver_ops
VT-D -> VT-d
pt_clear_entry -> pt_clear_entries
pt_entry_write_is_dirty -> pt_entry_is_write_dirty
pt_entry_set_write_clean -> pt_entry_make_write_clean
- Tidy some of the map flow into a new function do_map()
- Fix ffz64()
v6: https://patch.msgid.link/r/0-v6-0fb54a1d9850+36b-iommu_pt_jgg@nvidia.com
- Improve comments and documentation
- Rename pt_entry_oa_full -> pt_entry_oa_exact
pt_has_system_page -> pt_has_system_page_size
pt_max_output_address_lg2 -> pt_max_oa_lg2
log2_f*() -> vaf* / oaf* / f*_t
pt_item_fully_covered -> pt_entry_fully_covered
- Fix missed constant propogation causing division
- Consolidate debugging checks to pt_check_install_leaf_args()
- Change collect->ignore_mapped to check_mapped
- Shuffle some hunks around to more appropriate patches
- Two new mini kunit tests
v5: https://patch.msgid.link/r/0-v5-116c4948af3d+68091-iommu_pt_jgg@nvidia.com
- Text grammar updates and kdoc fixes
v4: https://patch.msgid.link/r/0-v4-0d6a6726a372+18959-iommu_pt_jgg@nvidia.com
- Rebase on v6.16-rc3
- Integrate the HATS/HATDis changes
- Remove 'default n' from kconfig
- Remove unused 'PT_FIXED_TOP_LEVEL'
- Improve comments and documentation
- Fix some compile warnings from kbuild robots
v3: https://patch.msgid.link/r/0-v3-a93aab628dbc+521-iommu_pt_jgg@nvidia.com
- Rebase on v6.16-rc2
- s/PT_ENTRY_WORD_SIZE/PT_ITEM_WORD_SIZE/s to follow the language better
- Comment and documentation updates
- Add PT_TOP_PHYS_MASK to help manage alignment restrictions on the top
pointer
- Add missed force_aperture = true
- Make pt_iommu_deinit() take care of the not-yet-inited error case
internally as AMD/RISCV/VTD all shared this logic
- Change gather_range() into gather_range_pages() so it also deals with
the page list. This makes the following cache flushing series simpler
- Fix missed update of unmap->unmapped in some error cases
- Change clear_contig() to order the gather more logically
- Remove goto from the error handling in __map_range_leaf()
- s/log2_/oalog2_/ in places where the argument is an oaddr_t
- Pass the pts to pt_table_install64/32()
- Do not use SIGN_EXTEND for the AMDv2 page table because of Vasant's
information on how PASID 0 works.
v2: https://patch.msgid.link/r/0-v2-5c26bde5c22d+58b-iommu_pt_jgg@nvidia.com
- AMD driver only, many code changes
RFC: https://lore.kernel.org/all/0-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com/
Cc: Michael Roth <michael.roth(a)amd.com>
Cc: Alexey Kardashevskiy <aik(a)amd.com>
Cc: Pasha Tatashin <pasha.tatashin(a)soleen.com>
Cc: James Gowans <jgowans(a)amazon.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
Alejandro Jimenez (1):
iommu/amd: Use the generic iommu page table
Jason Gunthorpe (14):
genpt: Generic Page Table base API
genpt: Add Documentation/ files
iommupt: Add the basic structure of the iommu implementation
iommupt: Add the AMD IOMMU v1 page table format
iommupt: Add iova_to_phys op
iommupt: Add unmap_pages op
iommupt: Add map_pages op
iommupt: Add read_and_clear_dirty op
iommupt: Add a kunit test for Generic Page Table
iommupt: Add a mock pagetable format for iommufd selftest to use
iommufd: Change the selftest to use iommupt instead of xarray
iommupt: Add the x86 64 bit page table format
iommu/amd: Remove AMD io_pgtable support
iommupt: Add a kunit test for the IOMMU implementation
.clang-format | 1 +
Documentation/driver-api/generic_pt.rst | 142 ++
Documentation/driver-api/index.rst | 1 +
drivers/iommu/Kconfig | 2 +
drivers/iommu/Makefile | 1 +
drivers/iommu/amd/Kconfig | 5 +-
drivers/iommu/amd/Makefile | 2 +-
drivers/iommu/amd/amd_iommu.h | 1 -
drivers/iommu/amd/amd_iommu_types.h | 110 +-
drivers/iommu/amd/io_pgtable.c | 577 --------
drivers/iommu/amd/io_pgtable_v2.c | 370 ------
drivers/iommu/amd/iommu.c | 538 ++++----
drivers/iommu/generic_pt/.kunitconfig | 13 +
drivers/iommu/generic_pt/Kconfig | 68 +
drivers/iommu/generic_pt/fmt/Makefile | 26 +
drivers/iommu/generic_pt/fmt/amdv1.h | 415 ++++++
drivers/iommu/generic_pt/fmt/defs_amdv1.h | 21 +
drivers/iommu/generic_pt/fmt/defs_x86_64.h | 21 +
drivers/iommu/generic_pt/fmt/iommu_amdv1.c | 15 +
drivers/iommu/generic_pt/fmt/iommu_mock.c | 10 +
drivers/iommu/generic_pt/fmt/iommu_template.h | 48 +
drivers/iommu/generic_pt/fmt/iommu_x86_64.c | 11 +
drivers/iommu/generic_pt/fmt/x86_64.h | 259 ++++
drivers/iommu/generic_pt/iommu_pt.h | 1162 +++++++++++++++++
drivers/iommu/generic_pt/kunit_generic_pt.h | 713 ++++++++++
drivers/iommu/generic_pt/kunit_iommu.h | 183 +++
drivers/iommu/generic_pt/kunit_iommu_pt.h | 487 +++++++
drivers/iommu/generic_pt/pt_common.h | 358 +++++
drivers/iommu/generic_pt/pt_defs.h | 329 +++++
drivers/iommu/generic_pt/pt_fmt_defaults.h | 233 ++++
drivers/iommu/generic_pt/pt_iter.h | 636 +++++++++
drivers/iommu/generic_pt/pt_log2.h | 122 ++
drivers/iommu/io-pgtable.c | 4 -
drivers/iommu/iommufd/Kconfig | 1 +
drivers/iommu/iommufd/iommufd_test.h | 11 +-
drivers/iommu/iommufd/selftest.c | 438 +++----
include/linux/generic_pt/common.h | 167 +++
include/linux/generic_pt/iommu.h | 271 ++++
include/linux/io-pgtable.h | 2 -
include/linux/irqchip/riscv-imsic.h | 3 +-
tools/testing/selftests/iommu/iommufd.c | 60 +-
tools/testing/selftests/iommu/iommufd_utils.h | 12 +
42 files changed, 6237 insertions(+), 1612 deletions(-)
create mode 100644 Documentation/driver-api/generic_pt.rst
delete mode 100644 drivers/iommu/amd/io_pgtable.c
delete mode 100644 drivers/iommu/amd/io_pgtable_v2.c
create mode 100644 drivers/iommu/generic_pt/.kunitconfig
create mode 100644 drivers/iommu/generic_pt/Kconfig
create mode 100644 drivers/iommu/generic_pt/fmt/Makefile
create mode 100644 drivers/iommu/generic_pt/fmt/amdv1.h
create mode 100644 drivers/iommu/generic_pt/fmt/defs_amdv1.h
create mode 100644 drivers/iommu/generic_pt/fmt/defs_x86_64.h
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_amdv1.c
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_mock.c
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_template.h
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_x86_64.c
create mode 100644 drivers/iommu/generic_pt/fmt/x86_64.h
create mode 100644 drivers/iommu/generic_pt/iommu_pt.h
create mode 100644 drivers/iommu/generic_pt/kunit_generic_pt.h
create mode 100644 drivers/iommu/generic_pt/kunit_iommu.h
create mode 100644 drivers/iommu/generic_pt/kunit_iommu_pt.h
create mode 100644 drivers/iommu/generic_pt/pt_common.h
create mode 100644 drivers/iommu/generic_pt/pt_defs.h
create mode 100644 drivers/iommu/generic_pt/pt_fmt_defaults.h
create mode 100644 drivers/iommu/generic_pt/pt_iter.h
create mode 100644 drivers/iommu/generic_pt/pt_log2.h
create mode 100644 include/linux/generic_pt/common.h
create mode 100644 include/linux/generic_pt/iommu.h
base-commit: bf3db0366052dcdf7dea89a07929b690aac59b15
--
2.43.0
This small patchset is about avoid verifier bug warning when conditional
jumps on same register when the register holds a scalar with range.
v3:
- Enhance is_scalar_branch_taken() to handle scalar case. (Eduard)
- Update the selftest to cover all conditional jump opcodes. (Eduard)
v2:
https://lore.kernel.org/bpf/20251025053017.2308823-1-kafai.wan@linux.dev/
- Enhance is_branch_taken() and is_scalar_branch_taken() to handle
branch direction computation for same register. (Eduard and Alexei)
- Update the selftest.
v1:
https://lore.kernel.org/bpf/20251022164457.1203756-1-kafai.wan@linux.dev/
---
KaFai Wan (2):
bpf: Skip bounds adjustment for conditional jumps on same scalar
register
selftests/bpf: Add test for conditional jumps on same scalar register
kernel/bpf/verifier.c | 33 ++++
.../selftests/bpf/progs/verifier_bounds.c | 154 ++++++++++++++++++
2 files changed, 187 insertions(+)
--
2.43.0
From: Chia-Yu Chang <chia-yu.chang(a)nokia-bell-labs.com>
Hello,
Plesae find the v5 AccECN case handling patch series, which covers
several excpetional case handling of Accurate ECN spec (RFC9768),
adds new identifiers to be used by CC modules, adds ecn_delta into
rate_sample, and keeps the ACE counter for computation, etc.
This patch series is part of the full AccECN patch series, which is available at
https://github.com/L4STeam/linux-net-next/commits/upstream_l4steam/
Best regards,
Chia-Yu
---
v5:
- Move previous #11 in v4 in latter patch after discussion with RFC author.
- Add #3 to update the comments for SKB_GSO_TCP_ECN and SKB_GSO_TCP_ACCECN. (Parav Pandit <parav(a)nvidia.com>)
- Add gro self-test for TCP CWR flag in #4. (Eric Dumazet <edumazet(a)google.com>)
- Add fixes: tag into #7 (Paolo Abeni <pabeni(a)redhat.com>)
- Update commit message of #8 and if condition check (Paolo Abeni <pabeni(a)redhat.com>)
- Add empty line between variable declarations and code in #13 (Paolo Abeni <pabeni(a)redhat.com>)
v4:
- Add previous #13 in v2 back after dicussion with the RFC author.
- Add TCP_ACCECN_OPTION_PERSIST to tcp_ecn_option sysctl to ignore AccECN fallback policy on sending AccECN option.
v3:
- Add additional min() check if pkts_acked_ewma is not initialized in #1. (Paolo Abeni <pabeni(a)redhat.com>)
- Change TCP_CONG_WANTS_ECT_1 into individual flag add helper function INET_ECN_xmit_wants_ect_1() in #3. (Paolo Abeni <pabeni(a)redhat.com>)
- Add empty line between variable declarations and code in #4. (Paolo Abeni <pabeni(a)redhat.com>)
- Update commit message to fix old AccECN commits in #5. (Paolo Abeni <pabeni(a)redhat.com>)
- Remove unnecessary brackets in #10. (Paolo Abeni <pabeni(a)redhat.com>)
- Move patch #3 in v2 to a later Prague patch serise and remove patch #13 in v2. (Paolo Abeni <pabeni(a)redhat.com>)
---
Chia-Yu Chang (12):
net: update commnets for SKB_GSO_TCP_ECN and SKB_GSO_TCP_ACCECN
selftests/net: gro: add self-test for TCP CWR flag
tcp: L4S ECT(1) identifier and NEEDS_ACCECN for CC modules
tcp: disable RFC3168 fallback identifier for CC modules
tcp: accecn: handle unexpected AccECN negotiation feedback
tcp: accecn: retransmit downgraded SYN in AccECN negotiation
tcp: move increment of num_retrans
tcp: accecn: retransmit SYN/ACK without AccECN option or non-AccECN
SYN/ACK
tcp: accecn: unset ECT if receive or send ACE=0 in AccECN negotiaion
tcp: accecn: fallback outgoing half link to non-AccECN
tcp: accecn: detect loss ACK w/ AccECN option and add
TCP_ACCECN_OPTION_PERSIST
tcp: accecn: enable AccECN
Ilpo Järvinen (2):
tcp: try to avoid safer when ACKs are thinned
gro: flushing when CWR is set negatively affects AccECN
Documentation/networking/ip-sysctl.rst | 4 +-
.../networking/net_cachelines/tcp_sock.rst | 1 +
include/linux/skbuff.h | 13 ++-
include/linux/tcp.h | 4 +-
include/net/inet_ecn.h | 20 +++-
include/net/tcp.h | 32 ++++++-
include/net/tcp_ecn.h | 92 ++++++++++++++-----
net/ipv4/sysctl_net_ipv4.c | 4 +-
net/ipv4/tcp.c | 2 +
net/ipv4/tcp_cong.c | 10 +-
net/ipv4/tcp_input.c | 37 +++++++-
net/ipv4/tcp_minisocks.c | 40 +++++---
net/ipv4/tcp_offload.c | 3 +-
net/ipv4/tcp_output.c | 42 ++++++---
tools/testing/selftests/net/gro.c | 80 +++++++++++-----
15 files changed, 294 insertions(+), 90 deletions(-)
--
2.34.1
Parsing KTAP is quite an inconvenience, but most of the time the thing
you really want to know is "did anything fail"?
Let's give the user the his information without them needing
to parse anything.
Because of the use of subshells and namespaces, this needs to be
communicated via a file. Just write arbitrary data into the file and
treat non-empty content as a signal that something failed.
In case any user depends on the current behaviour, such as running this
from a script with `set -e` and parsing the result for failures
afterwards, add a flag they can set to get the old behaviour, namely
--no-error-on-fail.
Signed-off-by: Brendan Jackman <jackmanb(a)google.com>
---
Changes in v3:
- Fixed quoting
- Link to v2: https://lore.kernel.org/r/20251014-b4-ksft-error-on-fail-v2-1-b3e2657237b8@…
Changes in v2:
- Fixed bug in report_failure()
- Made error-on-fail the default
- Link to v1: https://lore.kernel.org/r/20251007-b4-ksft-error-on-fail-v1-1-71bf058f5662@…
---
tools/testing/selftests/kselftest/runner.sh | 14 ++++++++++----
tools/testing/selftests/run_kselftest.sh | 14 ++++++++++++++
2 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh
index 2c3c58e65a419f5ee8d7dc51a37671237a07fa0b..3a62039fa6217f3453423ff011575d0a1eb8c275 100644
--- a/tools/testing/selftests/kselftest/runner.sh
+++ b/tools/testing/selftests/kselftest/runner.sh
@@ -44,6 +44,12 @@ tap_timeout()
fi
}
+report_failure()
+{
+ echo "not ok $*"
+ echo "$*" >> "$kselftest_failures_file"
+}
+
run_one()
{
DIR="$1"
@@ -105,7 +111,7 @@ run_one()
echo "# $TEST_HDR_MSG"
if [ ! -e "$TEST" ]; then
echo "# Warning: file $TEST is missing!"
- echo "not ok $test_num $TEST_HDR_MSG"
+ report_failure "$test_num $TEST_HDR_MSG"
else
if [ -x /usr/bin/stdbuf ]; then
stdbuf="/usr/bin/stdbuf --output=L "
@@ -123,7 +129,7 @@ run_one()
interpreter=$(head -n 1 "$TEST" | cut -c 3-)
cmd="$stdbuf $interpreter ./$BASENAME_TEST"
else
- echo "not ok $test_num $TEST_HDR_MSG"
+ report_failure "$test_num $TEST_HDR_MSG"
return
fi
fi
@@ -137,9 +143,9 @@ run_one()
echo "ok $test_num $TEST_HDR_MSG # SKIP"
elif [ $rc -eq $timeout_rc ]; then \
echo "#"
- echo "not ok $test_num $TEST_HDR_MSG # TIMEOUT $kselftest_timeout seconds"
+ report_failure "$test_num $TEST_HDR_MSG # TIMEOUT $kselftest_timeout seconds"
else
- echo "not ok $test_num $TEST_HDR_MSG # exit=$rc"
+ report_failure "$test_num $TEST_HDR_MSG # exit=$rc"
fi)
cd - >/dev/null
fi
diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh
index 0443beacf3621ae36cb12ffd57f696ddef3526b5..d4be97498b32e975c63a1167d3060bdeba674c8c 100755
--- a/tools/testing/selftests/run_kselftest.sh
+++ b/tools/testing/selftests/run_kselftest.sh
@@ -33,6 +33,7 @@ Usage: $0 [OPTIONS]
-c | --collection COLLECTION Run all tests from COLLECTION
-l | --list List the available collection:test entries
-d | --dry-run Don't actually run any tests
+ -f | --no-error-on-fail Don't exit with an error just because tests failed
-n | --netns Run each test in namespace
-h | --help Show this usage info
-o | --override-timeout Number of seconds after which we timeout
@@ -44,6 +45,7 @@ COLLECTIONS=""
TESTS=""
dryrun=""
kselftest_override_timeout=""
+ERROR_ON_FAIL=true
while true; do
case "$1" in
-s | --summary)
@@ -65,6 +67,9 @@ while true; do
-d | --dry-run)
dryrun="echo"
shift ;;
+ -f | --no-error-on-fail)
+ ERROR_ON_FAIL=false
+ shift ;;
-n | --netns)
RUN_IN_NETNS=1
shift ;;
@@ -105,9 +110,18 @@ if [ -n "$TESTS" ]; then
available="$(echo "$valid" | sed -e 's/ /\n/g')"
fi
+kselftest_failures_file="$(mktemp --tmpdir kselftest-failures-XXXXXX)"
+export kselftest_failures_file
+
collections=$(echo "$available" | cut -d: -f1 | sort | uniq)
for collection in $collections ; do
[ -w /dev/kmsg ] && echo "kselftest: Running tests in $collection" >> /dev/kmsg
tests=$(echo "$available" | grep "^$collection:" | cut -d: -f2)
($dryrun cd "$collection" && $dryrun run_many $tests)
done
+
+failures="$(cat "$kselftest_failures_file")"
+rm "$kselftest_failures_file"
+if "$ERROR_ON_FAIL" && [ "$failures" ]; then
+ exit 1
+fi
---
base-commit: 8f5ae30d69d7543eee0d70083daf4de8fe15d585
change-id: 20251007-b4-ksft-error-on-fail-0c2cb3246041
Best regards,
--
Brendan Jackman <jackmanb(a)google.com>
Hello,
this series is a small follow-up to the test_tc_tunnel recent
integration, to address some small missing details raised during the
final review ([1]). This is mostly about adding some missing checks on
net namespaces management.
[1] https://lore.kernel.org/bpf/1ac9d14e-4250-480c-b863-410be78ac6c6@linux.dev/
Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore(a)bootlin.com>
---
Alexis Lothoré (eBPF Foundation) (3):
selftests/bpf: skip tc_tunnel subtest if its setup fails
selftests/bpf: add checks in tc_tunnel when entering net namespaces
selftests/bpf: use start_server_str rather than start_reuseport_server in tc_tunnel
.../selftests/bpf/prog_tests/test_tc_tunnel.c | 162 ++++++++++++++-------
1 file changed, 107 insertions(+), 55 deletions(-)
---
base-commit: 1e2d874b04ba46a3b9fe6697097aa437641f4339
change-id: 20251030-tc_tunnel_improv-6b9d1c22c6f6
Best regards,
--
Alexis Lothoré, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
This patch set adds timer test case for LoongArch system, it is based
on common arch_timer test case. And it includes time counter function,
one-shot/period mode interrupt, and software emulated timer function
test.
Bibo Mao (6):
KVM: LoongArch: selftests: Add system registers save and restore on
exception
KVM: LoongArch: selftests: Add exception handler register interface
KVM: LoongArch: selftests: Add basic interfaces
KVM: LoongArch: selftests: Add timer test case with one-shot mode
KVM: LoongArch: selftests: Add period mode timer and time counter test
KVM: LoongArch: selftests: Add SW emulated timer test
tools/testing/selftests/kvm/Makefile.kvm | 10 +-
.../kvm/include/loongarch/arch_timer.h | 84 ++++++++
.../kvm/include/loongarch/processor.h | 81 +++++++-
.../selftests/kvm/lib/loongarch/exception.S | 6 +
.../selftests/kvm/lib/loongarch/processor.c | 38 +++-
.../selftests/kvm/loongarch/arch_timer.c | 187 ++++++++++++++++++
6 files changed, 400 insertions(+), 6 deletions(-)
create mode 100644 tools/testing/selftests/kvm/include/loongarch/arch_timer.h
create mode 100644 tools/testing/selftests/kvm/loongarch/arch_timer.c
base-commit: e53642b87a4f4b03a8d7e5f8507fc3cd0c595ea6
--
2.39.3
Hi Linus,
Please pull the following kselftest fixes update for Linux 6.18-rc4.
Fixes build warning in cachestat found during clang build and adds
tmpshmcstat to .gitignore.
diff is attached.
thanks,
-- Shuah
----------------------------------------------------------------
The following changes since commit 3a8660878839faadb4f1a6dd72c3179c1df56787:
Linux 6.18-rc1 (2025-10-12 13:42:36 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest tags/linux_kselftest-fixes-6.18-rc4
for you to fetch changes up to 920aa3a7705a061cb3004572d8b7932b54463dbf:
selftests: cachestat: Fix warning on declaration under label (2025-10-22 09:23:18 -0600)
----------------------------------------------------------------
linux_kselftest-fixes-6.18-rc4
Fixes build warning in cachestat found during clang build and adds
tmpshmcstat to .gitignore.
----------------------------------------------------------------
Madhur Kumar (1):
selftests/cachestat: add tmpshmcstat file to .gitignore
Sidharth Seela (1):
selftests: cachestat: Fix warning on declaration under label
tools/testing/selftests/cachestat/.gitignore | 1 +
tools/testing/selftests/cachestat/test_cachestat.c | 4 ++--
2 files changed, 3 insertions(+), 2 deletions(-)
----------------------------------------------------------------
Hi Linus,
Please pull the following kunit fixes update for Linux 6.18-rc4.
Fixes log overwrite in param_tests and fixes incorrect cast of priv
pointer in test_dev_action(). Updates email address for Rae Moar in
MAINTAINERS KUnit entry.
diff is attached.
thanks,
-- Shuah
----------------------------------------------------------------
The following changes since commit 3a8660878839faadb4f1a6dd72c3179c1df56787:
Linux 6.18-rc1 (2025-10-12 13:42:36 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest tags/linux_kselftest-kunit-fixes-6.18-rc4
for you to fetch changes up to f3903ec76ae6afcdba0347681d1dda005fb145cd:
MAINTAINERS: Update KUnit email address for Rae Moar (2025-10-29 14:57:54 -0600)
----------------------------------------------------------------
linux_kselftest-kunit-fixes-6.18-rc4
Fixes log overwrite in param_tests and fixes incorrect cast of priv
pointer in test_dev_action(). Updates email address for Rae Moar in
MAINTAINERS KUnit entry.
----------------------------------------------------------------
Carlos Llamas (1):
kunit: prevent log overwrite in param_tests
Florian Schmaus (1):
kunit: test_dev_action: Correctly cast 'priv' pointer to long*
Rae Moar (1):
MAINTAINERS: Update KUnit email address for Rae Moar
.mailmap | 1 +
MAINTAINERS | 2 +-
lib/kunit/kunit-test.c | 2 +-
lib/kunit/test.c | 3 ++-
4 files changed, 5 insertions(+), 3 deletions(-)
----------------------------------------------------------------
The test 'ethtool-features.sh' failed with the below output:
TAP version 13
1..1
# timeout set to 600
# selftests: drivers/net/netdevsim: ethtool-features.sh
# Warning: file ethtool-features.sh is not executable
# ethtool: bad command line argument(s)
# For more information run ethtool -h
# ethtool: bad command line argument(s)
# For more information run ethtool -h
# ethtool: bad command line argument(s)
# For more information run ethtool -h
# ethtool: bad command line argument(s)
# For more information run ethtool -h
# ethtool: bad command line argument(s)
# For more information run ethtool -h
# ethtool: bad command line argument(s)
# For more information run ethtool -h
# ethtool: bad command line argument(s)
# For more information run ethtool -h
# ethtool: bad command line argument(s)
# For more information run ethtool -h
# ethtool: bad command line argument(s)
# For more information run ethtool -h
# ethtool: bad command line argument(s)
# For more information run ethtool -h
# FAILED 10/10 checks
not ok 1 selftests: drivers/net/netdevsim: ethtool-features.sh # exit=1
Similar to commit 18378b0e49d9 ("selftests/damon: Add executable
permission to test scripts"), the script 'ethtool-features.sh' has no
executable permission, which leads to the warning 'file
ethtool-features.sh is not executable'.
Old version ethtool (my ethtool version is 5.16) does not support command
'ethtool --json -k enp1s0', which leads to the output 'ethtool: bad
command line argument(s)'.
This patch adds executable permission to script 'ethtool-features.sh', and
check 'ethtool --json -k' support. After this patch:
TAP version 13
1..1
# timeout set to 600
# selftests: drivers/net/netdevsim: ethtool-features.sh
# SKIP: No --json -k support in ethtool
ok 1 selftests: drivers/net/netdevsim: ethtool-features.sh
Fixes: 0189270117c3 ("selftests: netdevsim: add a test checking ethtool features")
Signed-off-by: Wang Liang <wangliang74(a)huawei.com>
---
.../selftests/drivers/net/netdevsim/ethtool-features.sh | 5 +++++
1 file changed, 5 insertions(+)
mode change 100644 => 100755 tools/testing/selftests/drivers/net/netdevsim/ethtool-features.sh
diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-features.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-features.sh
old mode 100644
new mode 100755
index bc210dc6ad2d..f771dc6839ea
--- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-features.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-features.sh
@@ -7,6 +7,11 @@ NSIM_NETDEV=$(make_netdev)
set -o pipefail
+if ! ethtool --json -k $NSIM_NETDEV > /dev/null 2>&1; then
+ echo "SKIP: No --json -k support in ethtool"
+ exit $ksft_skip
+fi
+
FEATS="
tx-checksum-ip-generic
tx-scatter-gather
--
2.34.1
Prior to commit 9245fd6b8531 ("KVM: x86: model canonical checks more
precisely"), KVM_SET_NESTED_STATE would fail if the state was captured
with L2 active, L1 had CR4.LA57 set, L2 did not, and the
VMCS12.HOST_GSBASE (or other host-state field checked for canonicality)
had an address greater than 48 bits wide.
Add a regression test that reproduces the KVM_SET_NESTED_STATE failure
conditions. To do so, the first three patches add support for 5-level
paging in the selftest L1 VM.
v1 -> v2
Ended the page walking loops before visiting 4K mappings [Yosry]
Changed VM_MODE_PXXV48_4K into VM_MODE_PXXVYY_4K;
use 5-level paging when possible [Sean]
Removed the check for non-NULL vmx_pages in guest_code() [Yosry]
Jim Mattson (4):
KVM: selftests: Use a loop to create guest page tables
KVM: selftests: Use a loop to walk guest page tables
KVM: selftests: Change VM_MODE_PXXV48_4K to VM_MODE_PXXVYY_4K
KVM: selftests: Add a VMX test for LA57 nested state
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../testing/selftests/kvm/include/kvm_util.h | 4 +-
.../selftests/kvm/include/x86/processor.h | 2 +-
.../selftests/kvm/lib/arm64/processor.c | 2 +-
tools/testing/selftests/kvm/lib/kvm_util.c | 30 ++--
.../testing/selftests/kvm/lib/x86/processor.c | 80 +++++------
tools/testing/selftests/kvm/lib/x86/vmx.c | 6 +-
.../kvm/x86/vmx_la57_nested_state_test.c | 134 ++++++++++++++++++
8 files changed, 197 insertions(+), 62 deletions(-)
create mode 100644 tools/testing/selftests/kvm/x86/vmx_la57_nested_state_test.c
--
2.51.1.851.g4ebd6896fd-goog
[ based on kvm/next ]
Implement guest_memfd population via the write syscall.
This is useful in non-CoCo use cases where the host can access guest
memory. Even though the same can also be achieved via userspace mapping
and memcpying from userspace, write provides a more performant option
because it does not need to set page tables and it does not cause a page
fault for every page like memcpy would. Note that memcpy cannot be
accelerated via MADV_POPULATE_WRITE as it is not supported by
guest_memfd and relies on GUP.
Populating 512MiB of guest_memfd on a x86 machine:
- via memcpy: 436 ms
- via write: 202 ms (-54%)
The write syscall support is conditional on kvm_gmem_supports_mmap.
When in-place shared/private conversion is supported, write should only
be allowed on shared pages.
v6:
- Make write support conditional on mmap support instead of relying on
the up-to-date flag to decide whether writing to a page is allowed
- James: Remove depenendencies on folio_test_large
- James: Remove page alignment restriction
- James: Formatting fixes
v5:
- https://lore.kernel.org/kvm/20250902111951.58315-1-kalyazin@amazon.com/
- Replace the call to the unexported filemap_remove_folio with
zeroing the bytes that could not be copied
- Fix checkpatch findings
v4:
- https://lore.kernel.org/kvm/20250828153049.3922-1-kalyazin@amazon.com
- Switch from implementing the write callback to write_iter
- Remove conditional compilation
v3:
- https://lore.kernel.org/kvm/20250303130838.28812-1-kalyazin@amazon.com
- David/Mike D: Only compile support for the write syscall if
CONFIG_KVM_GMEM_SHARED_MEM (now gone) is enabled.
v2:
- https://lore.kernel.org/kvm/20241129123929.64790-1-kalyazin@amazon.com
- Switch from an ioctl to the write syscall to implement population
v1:
- https://lore.kernel.org/kvm/20241024095429.54052-1-kalyazin@amazon.com
Nikita Kalyazin (2):
KVM: guest_memfd: add generic population via write
KVM: selftests: update guest_memfd write tests
.../testing/selftests/kvm/guest_memfd_test.c | 51 ++++++++++++++++---
virt/kvm/guest_memfd.c | 49 ++++++++++++++++++
2 files changed, 94 insertions(+), 6 deletions(-)
base-commit: 6b36119b94d0b2bb8cea9d512017efafd461d6ac
--
2.50.1
sched_ext tasks can be starved by long-running RT tasks, especially since
RT throttling was replaced by deadline servers to boost only SCHED_NORMAL
tasks.
Several users in the community have reported issues with RT stalling
sched_ext tasks. This is fairly common on distributions or environments
where applications like video compositors, audio services, etc. run as RT
tasks by default.
Example trace (showing a per-CPU kthread stalled due to the sway Wayland
compositor running as an RT task):
runnable task stall (kworker/0:0[106377] failed to run for 5.043s)
...
CPU 0 : nr_run=3 flags=0xd cpu_rel=0 ops_qseq=20646200 pnt_seq=45388738
curr=sway[994] class=rt_sched_class
R kworker/0:0[106377] -5043ms
scx_state/flags=3/0x1 dsq_flags=0x0 ops_state/qseq=0/0
sticky/holding_cpu=-1/-1 dsq_id=0x8000000000000002 dsq_vtime=0 slice=20000000
cpus=01
This is often perceived as a bug in the BPF schedulers, but in reality
schedulers can't do much: RT tasks run outside their control and can
potentially consume 100% of the CPU bandwidth.
Fix this by adding a sched_ext deadline server, so that sched_ext tasks are
also boosted and do not suffer starvation.
Two kselftests are also provided to verify the starvation fixes and
bandwidth allocation is correct.
== Highlights in this version ==
- wait for inactive_task_timer() to fire before removing the bandwidth
reservation (Juri/Peter: please check if this new
dl_server_remove_params() implementation makes sense to you)
- removed the explicit dl_server_stop() from dequeue_task_scx() and rely
on the delayed stop behavior (Juri/Peter: ditto)
This patchset is also available in the following git branch:
git://git.kernel.org/pub/scm/linux/kernel/git/arighi/linux.git scx-dl-server
Changes in v10:
- reordered patches to better isolate sched_ext changes vs sched/deadline
changes (Andrea Righi)
- define ext_server only with CONFIG_SCHED_CLASS_EXT=y (Andrea Righi)
- add WARN_ON_ONCE(!cpus) check in dl_server_apply_params() (Andrea Righi)
- wait for inactive_task_timer to fire before removing the bandwidth
reservation (Juri Lelli)
- remove explicit dl_server_stop() in dequeue_task_scx() to reduce timer
reprogramming overhead (Juri Lelli)
- do not restart pick_task() when invoked by the dl_server (Tejun Heo)
- rename rq_dl_server to dl_server (Peter Zijlstra)
- fixed a missing dl_server start in dl_server_on() (Christian Loehle)
- add a comment to the rt_stall selftest to better explain the 4%
threshold (Emil Tsalapatis)
Changes in v9:
- Drop the ->balance() logic as its functionality is now integrated into
->pick_task(), allowing dl_server to call pick_task_scx() directly
- Link to v8: https://lore.kernel.org/all/20250903095008.162049-1-arighi@nvidia.com/
Changes in v8:
- Add tj's patch to de-couple balance and pick_task and avoid changing
sched/core callbacks to propagate @rf
- Simplify dl_se->dl_server check (suggested by PeterZ)
- Small coding style fixes in the kselftests
- Link to v7: https://lore.kernel.org/all/20250809184800.129831-1-joelagnelf@nvidia.com/
Changes in v7:
- Rebased to Linus master
- Link to v6: https://lore.kernel.org/all/20250702232944.3221001-1-joelagnelf@nvidia.com/
Changes in v6:
- Added Acks to few patches
- Fixes to few nits suggested by Tejun
- Link to v5: https://lore.kernel.org/all/20250620203234.3349930-1-joelagnelf@nvidia.com/
Changes in v5:
- Added a kselftest (total_bw) to sched_ext to verify bandwidth values
from debugfs
- Address comment from Andrea about redundant rq clock invalidation
- Link to v4: https://lore.kernel.org/all/20250617200523.1261231-1-joelagnelf@nvidia.com/
Changes in v4:
- Fixed issues with hotplugged CPUs having their DL server bandwidth
altered due to loading SCX
- Fixed other issues
- Rebased on Linus master
- All sched_ext kselftests reliably pass now, also verified that the
total_bw in debugfs (CONFIG_SCHED_DEBUG) is conserved with these patches
- Link to v3: https://lore.kernel.org/all/20250613051734.4023260-1-joelagnelf@nvidia.com/
Changes in v3:
- Removed code duplication in debugfs. Made ext interface separate
- Fixed issue where rq_lock_irqsave was not used in the relinquish patch
- Fixed running bw accounting issue in dl_server_remove_params
- Link to v2: https://lore.kernel.org/all/20250602180110.816225-1-joelagnelf@nvidia.com/
Changes in v2:
- Fixed a hang related to using rq_lock instead of rq_lock_irqsave
- Added support to remove BW of DL servers when they are switched to/from EXT
- Link to v1: https://lore.kernel.org/all/20250315022158.2354454-1-joelagnelf@nvidia.com/
Andrea Righi (5):
sched/deadline: Add support to initialize and remove dl_server bandwidth
sched_ext: Add a DL server for sched_ext tasks
sched/deadline: Account ext server bandwidth
sched_ext: Selectively enable ext and fair DL servers
selftests/sched_ext: Add test for sched_ext dl_server
Joel Fernandes (6):
sched/debug: Fix updating of ppos on server write ops
sched/debug: Stop and start server based on if it was active
sched/deadline: Clear the defer params
sched/deadline: Add a server arg to dl_server_update_idle_time()
sched/debug: Add support to change sched_ext server params
selftests/sched_ext: Add test for DL server total_bw consistency
kernel/sched/core.c | 3 +
kernel/sched/deadline.c | 169 +++++++++++---
kernel/sched/debug.c | 171 +++++++++++---
kernel/sched/ext.c | 144 +++++++++++-
kernel/sched/fair.c | 2 +-
kernel/sched/idle.c | 2 +-
kernel/sched/sched.h | 8 +-
kernel/sched/topology.c | 5 +
tools/testing/selftests/sched_ext/Makefile | 2 +
tools/testing/selftests/sched_ext/rt_stall.bpf.c | 23 ++
tools/testing/selftests/sched_ext/rt_stall.c | 222 ++++++++++++++++++
tools/testing/selftests/sched_ext/total_bw.c | 281 +++++++++++++++++++++++
12 files changed, 955 insertions(+), 77 deletions(-)
create mode 100644 tools/testing/selftests/sched_ext/rt_stall.bpf.c
create mode 100644 tools/testing/selftests/sched_ext/rt_stall.c
create mode 100644 tools/testing/selftests/sched_ext/total_bw.c
Hello,
this is the v3 of test_tc_tunnel conversion into test_progs framework.
This new revision:
- fixes a few issues spotted by the bot reviewer
- removes any test ensuring connection failure (and so depending on a
timout) to keep the execution time reasonable
test_tc_tunnel.sh tests a variety of tunnels based on BPF: packets are
encapsulated by a BPF program on the client egress. We then check that
those packets can be decapsulated on server ingress side, either thanks
to kernel-based or BPF-based decapsulation. Those tests are run thanks
to two veths in two dedicated namespaces.
- patches 1 and 2 are preparatory patches
- patch 3 introduce tc_tunnel test into test_progs
- patch 4 gets rid of the test_tc_tunnel.sh script
The new test has been executed both in some x86 local qemu machine, as
well as in CI:
# ./test_progs -a tc_tunnel
#454/1 tc_tunnel/ipip_none:OK
#454/2 tc_tunnel/ipip6_none:OK
#454/3 tc_tunnel/ip6tnl_none:OK
#454/4 tc_tunnel/sit_none:OK
#454/5 tc_tunnel/vxlan_eth:OK
#454/6 tc_tunnel/ip6vxlan_eth:OK
#454/7 tc_tunnel/gre_none:OK
#454/8 tc_tunnel/gre_eth:OK
#454/9 tc_tunnel/gre_mpls:OK
#454/10 tc_tunnel/ip6gre_none:OK
#454/11 tc_tunnel/ip6gre_eth:OK
#454/12 tc_tunnel/ip6gre_mpls:OK
#454/13 tc_tunnel/udp_none:OK
#454/14 tc_tunnel/udp_eth:OK
#454/15 tc_tunnel/udp_mpls:OK
#454/16 tc_tunnel/ip6udp_none:OK
#454/17 tc_tunnel/ip6udp_eth:OK
#454/18 tc_tunnel/ip6udp_mpls:OK
#454 tc_tunnel:OK
Summary: 1/18 PASSED, 0 SKIPPED, 0 FAILED
Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore(a)bootlin.com>
---
Changes in v3:
- remove systematic "connection must fail" test part of each subtest
- also remove kernel-based decap test for subtests supposed to fail on
kernel side
- fix potential fd leak if connection structure allocation fails
- fix wrong early return in run_test
- Link to v2: https://lore.kernel.org/r/20251022-tc_tunnel-v2-0-a44a0bd52902@bootlin.com
Changes in v2:
- declare a single tc_prog_attach helper rather than multiple,
intermediate helpers
- move the new helper to network_helpers.c rather than a dedicated
file
- do not rename existing tc_helpers.c/h pair (drop patch)
- keep only the minimal set of needed NS switches
- Link to v1: https://lore.kernel.org/r/20251017-tc_tunnel-v1-0-2d86808d86b2@bootlin.com
---
Alexis Lothoré (eBPF Foundation) (4):
selftests/bpf: add tc helpers
selftests/bpf: make test_tc_tunnel.bpf.c compatible with big endian platforms
selftests/bpf: integrate test_tc_tunnel.sh tests into test_progs
selftests/bpf: remove test_tc_tunnel.sh
tools/testing/selftests/bpf/Makefile | 1 -
tools/testing/selftests/bpf/network_helpers.c | 45 ++
tools/testing/selftests/bpf/network_helpers.h | 16 +
.../selftests/bpf/prog_tests/test_tc_tunnel.c | 674 +++++++++++++++++++++
.../testing/selftests/bpf/prog_tests/test_tunnel.c | 107 +---
tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 95 ++-
tools/testing/selftests/bpf/test_tc_tunnel.sh | 320 ----------
7 files changed, 790 insertions(+), 468 deletions(-)
---
base-commit: ecdeefe65eaeb82a1262e20401ba750b8c9e0b97
change-id: 20250811-tc_tunnel-c61342683f18
Best regards,
--
Alexis Lothoré, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
get-reg-list includes ZCR_EL2 in the list of EL2 registers that it looks
for when NV is enabled but does not have any feature gate for this register,
meaning that testing any combination of features that includes EL2 but does
not include SVE will result in a test failure due to a missing register
being reported:
| The following lines are missing registers:
|
| ARM64_SYS_REG(3, 4, 1, 2, 0),
Add ZCR_EL2 to feat_id_regs so that the test knows not to expect to see it
without SVE being enabled.
Fixes: 3a90b6f27964 ("KVM: arm64: selftests: get-reg-list: Add base EL2 registers")
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
tools/testing/selftests/kvm/arm64/get-reg-list.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c
index c9b84eeaab6b..7ae26ce875ad 100644
--- a/tools/testing/selftests/kvm/arm64/get-reg-list.c
+++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c
@@ -68,6 +68,7 @@ static struct feature_id_reg feat_id_regs[] = {
REG_FEAT(VNCR_EL2, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY),
REG_FEAT(CNTHV_CTL_EL2, ID_AA64MMFR1_EL1, VH, IMP),
REG_FEAT(CNTHV_CVAL_EL2,ID_AA64MMFR1_EL1, VH, IMP),
+ REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP),
};
bool filter_reg(__u64 reg)
---
base-commit: 211ddde0823f1442e4ad052a2f30f050145ccada
change-id: 20251023-kvm-arm64-get-reg-list-zcr-el2-c43090e11f23
Best regards,
--
Mark Brown <broonie(a)kernel.org>
Since GITS_TYPER.PTA == 0, the ITS MAPC command demands a CPU ID,
rather than a physical redistributor address, for its RDbase
command argument.
As such, when MAPC-ing guest ITS collections, vgic_lpi_stress iterates
over CPU IDs in the range [0, nr_cpus), passing them as the RDbase
vcpu_id argument to its_send_mapc_cmd().
However, its_encode_target() in the its_send_mapc_cmd() selftest
handler expects RDbase arguments to be formatted with a 16 bit
offset, as shown by the 16-bit target_addr right shift its implementation:
its_mask_encode(&cmd->raw_cmd[2], target_addr >> 16, 51, 16)
At the moment, all CPU IDs passed into its_send_mapc_cmd() have no
offset, therefore becoming 0x0 after the bit shift. Thus, when
vgic_its_cmd_handle_mapc() receives the ITS command in vgic-its.c,
it always interprets the RDbase target CPU as CPU 0. All interrupts
sent to collections will be processed by vCPU 0, which defeats the
purpose of this multi-vCPU test.
Fix by creating procnum_to_rdbase() helper function, which left-shifts
the vCPU parameter received by its_send_mapc_cmd 16 bits before passing
it to its_encode_target for encoding.
Signed-off-by: Maximilian Dittgen <mdittgen(a)amazon.de>
---
v2: Refactor the vcpu_id left shift into procnum_to_rdbase() helper.
Rename and rewrite commit to reflect root cause of bug which was
improper RDbase formatting, not that MAPC expects a physical
address as the RDbase parameter.
To validate the patch, I added the following debug code at the top of vgic_its_cmd_handle_mapc:
u64 raw_cmd2 = le64_to_cpu(its_cmd[2]);
u32 target_addr = its_cmd_get_target_addr(its_cmd);
kvm_info("MAPC: coll_id=%d, raw_cmd[2]=0x%llx, parsed_target=%u\n",
coll_id, raw_cmd2, target_addr);
vcpu = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd));
kvm_info("MAPC: coll_id=%d, vcpu_id=%d\n", coll_id, vcpu ? vcpu->vcpu_id : -1);
I then ran `./vgic_lpi_stress -v 3` to trigger the stress selftest with 3 vCPUs.
Before the patch, the debug logs read:
kvm [20832]: MAPC: coll_id=0, raw_cmd[2]=0x8000000000000000, parsed_target=0
kvm [20832]: MAPC: coll_id=0, vcpu_id=0
kvm [20832]: MAPC: coll_id=1, raw_cmd[2]=0x8000000000000001, parsed_target=0
kvm [20832]: MAPC: coll_id=1, vcpu_id=0
kvm [20832]: MAPC: coll_id=2, raw_cmd[2]=0x8000000000000002, parsed_target=0
kvm [20832]: MAPC: coll_id=2, vcpu_id=0
Note the last bit of the cmd string reflects the collection ID, but the rest of the cmd string reads 0. The handler parses out vCPU 0 for all 3 mapc calls.
After the patch, the debug logs read:
kvm [20019]: MAPC: coll_id=0, raw_cmd[2]=0x8000000000000000, parsed_target=0
kvm [20019]: MAPC: coll_id=0, vcpu_id=0
kvm [20019]: MAPC: coll_id=1, raw_cmd[2]=0x8000000000010001, parsed_target=1
kvm [20019]: MAPC: coll_id=1, vcpu_id=1
kvm [20019]: MAPC: coll_id=2, raw_cmd[2]=0x8000000000020002, parsed_target=2
kvm [20019]: MAPC: coll_id=2, vcpu_id=2
Note that the target vcpu and target collection are both visible in the cmd string. The handler parses out the correct vCPU for all 3 mapc calls.
___
tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
index 09f270545646..0e2f8ed90f30 100644
--- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
+++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
@@ -15,6 +15,8 @@
#include "gic_v3.h"
#include "processor.h"
+#define GITS_COLLECTION_TARGET_SHIFT 16
+
static u64 its_read_u64(unsigned long offset)
{
return readq_relaxed(GITS_BASE_GVA + offset);
@@ -163,6 +165,11 @@ static void its_encode_collection(struct its_cmd_block *cmd, u16 col)
its_mask_encode(&cmd->raw_cmd[2], col, 15, 0);
}
+static u64 procnum_to_rdbase(u32 vcpu_id)
+{
+ return vcpu_id << GITS_COLLECTION_TARGET_SHIFT;
+}
+
#define GITS_CMDQ_POLL_ITERATIONS 0
static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd)
@@ -217,7 +224,7 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val
its_encode_cmd(&cmd, GITS_CMD_MAPC);
its_encode_collection(&cmd, collection_id);
- its_encode_target(&cmd, vcpu_id);
+ its_encode_target(&cmd, procnum_to_rdbase(vcpu_id));
its_encode_valid(&cmd, valid);
its_send_cmd(cmdq_base, &cmd);
--
2.50.1 (Apple Git-155)
Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christian Schlaeger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597
This series introduces stats counters for psp. Device key rotations,
and so called 'stale-events' are common to all drivers and are tracked
by the core.
A driver facing api is provided for reporting stats required by the
"Implementation Requirements" section of the PSP Architecture
Specification. Drivers must implement these stats.
Lastly, implementations of the driver stats api for mlx5 and netdevsim
are included.
Here is the output of running the psp selftest suite and then
printing out stats with the ynl cli on system with a psp-capable CX7:
$ ./ksft-psp-stats/drivers/net/psp.py
TAP version 13
1..28
ok 1 psp.test_case # SKIP Test requires IPv4 connectivity
ok 2 psp.data_basic_send_v0_ip6
ok 3 psp.test_case # SKIP Test requires IPv4 connectivity
ok 4 psp.data_basic_send_v1_ip6
ok 5 psp.test_case # SKIP Test requires IPv4 connectivity
ok 6 psp.data_basic_send_v2_ip6 # SKIP ('PSP version not supported', 'hdr0-aes-gmac-128')
ok 7 psp.test_case # SKIP Test requires IPv4 connectivity
ok 8 psp.data_basic_send_v3_ip6 # SKIP ('PSP version not supported', 'hdr0-aes-gmac-256')
ok 9 psp.test_case # SKIP Test requires IPv4 connectivity
ok 10 psp.data_mss_adjust_ip6
ok 11 psp.dev_list_devices
ok 12 psp.dev_get_device
ok 13 psp.dev_get_device_bad
ok 14 psp.dev_rotate
ok 15 psp.dev_rotate_spi
ok 16 psp.assoc_basic
ok 17 psp.assoc_bad_dev
ok 18 psp.assoc_sk_only_conn
ok 19 psp.assoc_sk_only_mismatch
ok 20 psp.assoc_sk_only_mismatch_tx
ok 21 psp.assoc_sk_only_unconn
ok 22 psp.assoc_version_mismatch
ok 23 psp.assoc_twice
ok 24 psp.data_send_bad_key
ok 25 psp.data_send_disconnect
ok 26 psp.data_stale_key
ok 27 psp.removal_device_rx # XFAIL Test only works on netdevsim
ok 28 psp.removal_device_bi # XFAIL Test only works on netdevsim
# Totals: pass:19 fail:0 xfail:2 xpass:0 skip:7 error:0
#
# Responder logs (0):
# STDERR:
# Set PSP enable on device 1 to 0x3
# Set PSP enable on device 1 to 0x0
$ cd ynl/
$ ./pyynl/cli.py --spec netlink/specs/psp.yaml --dump get-stats
[{'dev-id': 1,
'key-rotations': 5,
'rx-auth-fail': 21,
'rx-bad': 0,
'rx-bytes': 11844,
'rx-error': 0,
'rx-packets': 94,
'stale-events': 6,
'tx-bytes': 1128456,
'tx-error': 0,
'tx-packets': 780}]
CHANGES:
v2:
- don't return skb->len from psp_nl_get_stats_dumpit() on success and
EMSGSIZE
- use %pe to print PTR_ERR()
v1: https://lore.kernel.org/netdev/20251022193739.1376320-1-daniel.zahka@gmail.…
Daniel Zahka (2):
selftests: drv-net: psp: add assertions on core-tracked psp dev stats
netdevsim: implement psp device stats
Jakub Kicinski (3):
psp: report basic stats from the core
psp: add stats from psp spec to driver facing api
net/mlx5e: Add PSP stats support for Rx/Tx flows
Documentation/netlink/specs/psp.yaml | 95 +++++++
.../mellanox/mlx5/core/en_accel/psp.c | 239 ++++++++++++++++--
.../mellanox/mlx5/core/en_accel/psp.h | 18 ++
.../mellanox/mlx5/core/en_accel/psp_rxtx.c | 1 +
.../net/ethernet/mellanox/mlx5/core/en_main.c | 5 +
drivers/net/netdevsim/netdevsim.h | 5 +
drivers/net/netdevsim/psp.c | 27 ++
include/net/psp/types.h | 35 +++
include/uapi/linux/psp.h | 18 ++
net/psp/psp-nl-gen.c | 19 ++
net/psp/psp-nl-gen.h | 2 +
net/psp/psp_main.c | 3 +-
net/psp/psp_nl.c | 94 +++++++
net/psp/psp_sock.c | 4 +-
tools/testing/selftests/drivers/net/psp.py | 13 +
15 files changed, 561 insertions(+), 17 deletions(-)
--
2.47.3
While debugging issues related to aarch64 only systems I ran into
speedbumps due to the lack of detail in the results reported when the
guest register read and reset value preservation tests were run, they
generated an immediately fatal assert without indicating which register
was being tested. Update these tests to report a result per register,
making it much easier to see what the problem being reported is.
A similar, though less severe, issue exists with the validation of the
individual bitfields in registers due to the use of immediately fatal
asserts. Update those asserts to be standard kselftest reports.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Mark Brown (3):
KVM: selftests: arm64: Report set_id_reg reads of test registers as tests
KVM: selftests: arm64: Report register reset tests individually
KVM: selftests: arm64: Make set_id_regs bitfield validatity checks non-fatal
tools/testing/selftests/kvm/arm64/set_id_regs.c | 108 ++++++++++++++++++------
1 file changed, 82 insertions(+), 26 deletions(-)
---
base-commit: 211ddde0823f1442e4ad052a2f30f050145ccada
change-id: 20251028-kvm-arm64-set-id-regs-aarch64-ebb77969401c
Best regards,
--
Mark Brown <broonie(a)kernel.org>
When the BPF ring buffer is full, a new event cannot be recorded until one
or more old events are consumed to make enough space for it. In cases such
as fault diagnostics, where recent events are more useful than older ones,
this mechanism may lead to critical events being lost.
So add overwrite mode for BPF ring buffer to address it. In this mode, the
new event overwrites the oldest event when the buffer is full.
v3:
- remove half-round wakeup, drop unnecessary min in ringbuf_avail_data_sz(),
switch to smp_load_acquire, update tests and fix typos, etc (Andrii)
- rebase and re-collect performance data
v2:
https://lore.kernel.org/bpf/20250905150641.2078838-1-xukuohai@huaweicloud.c…
- remove libbpf changes (Andrii)
- update overwrite benchmark
v1:
https://lore.kernel.org/bpf/20250804022101.2171981-1-xukuohai@huaweicloud.c…
Xu Kuohai (3):
bpf: Add overwrite mode for BPF ring buffer
selftests/bpf: Add overwrite mode test for BPF ring buffer
selftests/bpf/benchs: Add overwrite mode benchmark for BPF ring buffer
include/uapi/linux/bpf.h | 4 +
kernel/bpf/ringbuf.c | 109 +++++++++++++++---
tools/include/uapi/linux/bpf.h | 4 +
tools/testing/selftests/bpf/Makefile | 3 +-
.../selftests/bpf/benchs/bench_ringbufs.c | 66 ++++++++++-
.../bpf/benchs/run_bench_ringbufs.sh | 4 +
.../selftests/bpf/prog_tests/ringbuf.c | 64 ++++++++++
.../selftests/bpf/progs/ringbuf_bench.c | 11 ++
.../bpf/progs/test_ringbuf_overwrite.c | 98 ++++++++++++++++
9 files changed, 337 insertions(+), 26 deletions(-)
create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c
--
2.43.0
Hi all,
The test_xsk.sh script covers many AF_XDP use cases. The tests it runs
are defined in xksxceiver.c. Since this script is used to test real
hardware, the goal here is to leave it as it is, and only integrate the
tests that run on veth peers into the test_progs framework.
I've looked into what could improve the speed in the CI:
- some tests are skipped when run on veth peers in a VM (because they
rely on huge page allocation or HW rings). This skipping logic still
takes some time and can be easily avoided.
- the TEARDOWN test is quite long (several seconds on its own) because
it runs the same test 10 times in a row to ensure the teardown process
works properly
With theses tests fully skipped in the CI and the veth setup done only
once for each mode (DRV / SKB), the execution time is reduced to about 5
seconds on my setup.
```
$ tools/testing/selftests/bpf/vmtest.sh -d $HOME/ebpf/output-regular/ -- time ./test_progs -t xsk
[...]
real 0m 5.04s
user 0m 0.38s
sys 0m 1.61s
```
It still feels a bit long, but there are 24 tests run in both DRV and
SKB modes which means around 100ms for each one. I'm not sure I can make
it much faster without randomizing the tests so that not all of them run
in every CI execution.
PATCH 1 extracts test_xsk[.c/.h] from xskxceiver[.c/.h] to make the
tests available to test_progs.
PATCH 2 to 7 fix small issues in the current test
PATCH 8 to 13 handle all errors to release resources instead of calling
exit() when any error occurs.
PATCH 14 isolates the tests that won't fit in the CI
PATCH 15 integrates the CI tests to the test_progs framework
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet(a)bootlin.com>
---
Changes in v6:
- Setup veth peer once for each mode instead of once for each substest
- Rename the 'flaky' table 'skip-ci' table and move the automatically
skipped and the longest tests into it
- Link to v5: https://lore.kernel.org/r/20251016-xsk-v5-0-662c95eb8005@bootlin.com
Changes in v5:
- Rebase on latest bpf-next_base
- Move XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF to the flaky table
- Add Maciej's reviewed-by
- Link to v4: https://lore.kernel.org/r/20250924-xsk-v4-0-20e57537b876@bootlin.com
Changes in v4:
- Fix test_xsk.sh's summary report.
- Merge PATCH 11 & 12 together, otherwise PATCH 11 fails to build.
- Split old PATCH 3 in two patches. The first one fixes
testapp_stats_rx_dropped(), the second one fixes
testapp_xdp_shared_umem(). The unecessary frees (in
testapp_stats_rx_full() and testapp_stats_fill_empty() are removed)
- Link to v3: https://lore.kernel.org/r/20250904-xsk-v3-0-ce382e331485@bootlin.com
Changes in v3:
- Rebase on latest bpf-next_base to integrate commit c9110e6f7237 ("selftests/bpf:
Fix count write in testapp_xdp_metadata_copy()").
- Move XDP_METADATA_COPY_* tests from flaky-tests to nominal tests
- Link to v2: https://lore.kernel.org/r/20250902-xsk-v2-0-17c6345d5215@bootlin.com
Changes in v2:
- Rebase on the latest bpf-next_base and integrate the newly added tests
to the work (adjust_tail* and tx_queue_consumer tests)
- Re-order patches to split xkxceiver sooner.
- Fix the bug reported by Maciej.
- Fix verbose mode in test_xsk.sh by keeping kselftest (remove PATCH 1,
7 and 8)
- Link to v1: https://lore.kernel.org/r/20250313-xsk-v1-0-7374729a93b9@bootlin.com
---
Bastien Curutchet (eBPF Foundation) (15):
selftests/bpf: test_xsk: Split xskxceiver
selftests/bpf: test_xsk: Initialize bitmap before use
selftests/bpf: test_xsk: Fix __testapp_validate_traffic()'s return value
selftests/bpf: test_xsk: fix memory leak in testapp_stats_rx_dropped()
selftests/bpf: test_xsk: fix memory leak in testapp_xdp_shared_umem()
selftests/bpf: test_xsk: Wrap test clean-up in functions
selftests/bpf: test_xsk: Release resources when swap fails
selftests/bpf: test_xsk: Add return value to init_iface()
selftests/bpf: test_xsk: Don't exit immediately when xsk_attach fails
selftests/bpf: test_xsk: Don't exit immediately when gettimeofday fails
selftests/bpf: test_xsk: Don't exit immediately when workers fail
selftests/bpf: test_xsk: Don't exit immediately if validate_traffic fails
selftests/bpf: test_xsk: Don't exit immediately on allocation failures
selftests/bpf: test_xsk: Isolate non-CI tests
selftests/bpf: test_xsk: Integrate test_xsk.c to test_progs framework
tools/testing/selftests/bpf/Makefile | 11 +-
tools/testing/selftests/bpf/prog_tests/test_xsk.c | 2595 ++++++++++++++++++++
tools/testing/selftests/bpf/prog_tests/test_xsk.h | 298 +++
tools/testing/selftests/bpf/prog_tests/xsk.c | 151 ++
tools/testing/selftests/bpf/xskxceiver.c | 2696 +--------------------
tools/testing/selftests/bpf/xskxceiver.h | 156 --
6 files changed, 3183 insertions(+), 2724 deletions(-)
---
base-commit: 4481a8590725400f37d3015f0ee0d53a2cdc1bd6
change-id: 20250218-xsk-0cf90e975d14
Best regards,
--
Bastien Curutchet (eBPF Foundation) <bastien.curutchet(a)bootlin.com>
The bareudp.sh script uses /bin/sh and it will load another lib.sh
BASH script at the very beginning.
But on some operating systems like Ubuntu, /bin/sh is actually pointed to
DASH, thus it will try to run BASH commands with DASH and consequently
leads to syntax issues.
This patch fixes syntax failures on systems where /bin/sh is not BASH by
explicitily using BASH for bareudp.sh.
Po-Hsu Lin (1):
selftests: net: use BASH for bareudp testing
tools/testing/selftests/net/bareudp.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--
2.34.1
Socket APIs like recvfrom(), accept(), and getsockname() expect socklen_t*
arg, but tests were using int variables. This causes -Wpointer-sign
warnings on platforms where socklen_t is unsigned.
Change the variable type from int to socklen_t to resolve the warning and
ensure type safety across platforms.
warning fixed:
sctp_collision.c:62:70: warning: passing 'int *' to parameter of
type 'socklen_t *' (aka 'unsigned int *') converts between pointers to
integer types with different sign [-Wpointer-sign]
62 | ret = recvfrom(sd, buf, sizeof(buf),
0, (struct sockaddr *)&daddr, &len);
| ^~~~
/usr/include/sys/socket.h:165:27: note: passing argument to
parameter '__addr_len' here
165 | socklen_t *__restrict __addr_len);
| ^
Reviewed-by: Muhammad Usama Anjum <usama.anjum(a)collabora.com>
Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux(a)gmail.com>
---
tools/testing/selftests/net/netfilter/sctp_collision.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/net/netfilter/sctp_collision.c b/tools/testing/selftests/net/netfilter/sctp_collision.c
index 21bb1cfd8a85..91df996367e9 100644
--- a/tools/testing/selftests/net/netfilter/sctp_collision.c
+++ b/tools/testing/selftests/net/netfilter/sctp_collision.c
@@ -9,7 +9,8 @@
int main(int argc, char *argv[])
{
struct sockaddr_in saddr = {}, daddr = {};
- int sd, ret, len = sizeof(daddr);
+ socklen_t len = sizeof(daddr);
struct timeval tv = {25, 0};
char buf[] = "hello";
+ int sd, ret;
--
2.51.0
This series fixes a memory corruption bug in KHO that occurs when KFENCE
is enabled.
The root cause is that KHO metadata, allocated via kzalloc(), can be
randomly serviced by kfence_alloc(). When a kernel boots via KHO, the
early memblock allocator is restricted to a "scratch area". This forces
the KFENCE pool to be allocated within this scratch area, creating a
conflict. If KHO metadata is subsequently placed in this pool, it gets
corrupted during the next kexec operation.
Patch 1/3 introduces a debug-only feature (CONFIG_KEXEC_HANDOVER_DEBUG)
that adds checks to detect and fail any operation that attempts to place
KHO metadata or preserved memory within the scratch area. This serves as
a validation and diagnostic tool to confirm the problem without
affecting production builds.
Patch 2/3 Increases bitmap to PAGE_SIZE, so buddy allocator can be used.
Patch 3/3 Provides the fix by modifying KHO to allocate its metadata
directly from the buddy allocator instead of slab. This bypasses the
KFENCE interception entirely.
Pasha Tatashin (3):
liveupdate: kho: warn and fail on metadata or preserved memory in
scratch area
liveupdate: kho: Increase metadata bitmap size to PAGE_SIZE
liveupdate: kho: allocate metadata directly from the buddy allocator
include/linux/gfp.h | 3 ++
kernel/Kconfig.kexec | 9 ++++
kernel/Makefile | 1 +
kernel/kexec_handover.c | 72 ++++++++++++++++++++------------
kernel/kexec_handover_debug.c | 25 +++++++++++
kernel/kexec_handover_internal.h | 16 +++++++
6 files changed, 100 insertions(+), 26 deletions(-)
create mode 100644 kernel/kexec_handover_debug.c
create mode 100644 kernel/kexec_handover_internal.h
base-commit: 6548d364a3e850326831799d7e3ea2d7bb97ba08
--
2.51.0.869.ge66316f041-goog
Hi all,
I have tried looking at an issue from the bpftool repository:
https://github.com/libbpf/bpftool/issues/121 and this RFC tries to add
that enhancement.
Summary: Currently when a map creation is successful there is no message
on the terminal, printing IDs on successful creation of maps can help
notify the user and can be used in CI/CD.
The first patch adds the logic for printing and the second patch adds a
simple selftest for the same.
The github issue is not fully solved with these two patches, as there
are other bpf objects that might need similar additions. Would
appreciate any inputs on this.
Thank you very much.
Regards,
Harshit
Harshit Mogalapalli (2):
bpftool: Print map ID upon creation and support JSON output
selftests/bpf: Add test for bpftool map ID printing
tools/bpf/bpftool/map.c | 24 +++++++++++---
.../testing/selftests/bpf/test_bpftool_map.sh | 32 +++++++++++++++++++
2 files changed, 52 insertions(+), 4 deletions(-)
--
2.50.1
The script "ethtool-common.sh" is not installed in INSTALL_PATH, and
triggers some errors when I try to run the test
'drivers/net/netdevsim/ethtool-coalesce.sh':
TAP version 13
1..1
# timeout set to 600
# selftests: drivers/net/netdevsim: ethtool-coalesce.sh
# ./ethtool-coalesce.sh: line 4: ethtool-common.sh: No such file or directory
# ./ethtool-coalesce.sh: line 25: make_netdev: command not found
# ethtool: bad command line argument(s)
# ./ethtool-coalesce.sh: line 124: check: command not found
# ./ethtool-coalesce.sh: line 126: [: -eq: unary operator expected
# FAILED /0 checks
not ok 1 selftests: drivers/net/netdevsim: ethtool-coalesce.sh # exit=1
Install this file to avoid this error. After this patch:
TAP version 13
1..1
# timeout set to 600
# selftests: drivers/net/netdevsim: ethtool-coalesce.sh
# PASSED all 22 checks
ok 1 selftests: drivers/net/netdevsim: ethtool-coalesce.sh
Fixes: fbb8531e58bd ("selftests: extract common functions in ethtool-common.sh")
Signed-off-by: Wang Liang <wangliang74(a)huawei.com>
---
tools/testing/selftests/drivers/net/netdevsim/Makefile | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile
index daf51113c827..653141a654a0 100644
--- a/tools/testing/selftests/drivers/net/netdevsim/Makefile
+++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile
@@ -20,4 +20,6 @@ TEST_PROGS := \
udp_tunnel_nic.sh \
# end of TEST_PROGS
+TEST_FILES := ethtool-common.sh
+
include ../../../lib.mk
--
2.34.1
Hey all,
This patch series refactors the vsock selftest VM infrastructure to
improve test run times, reduce false-positives, improve logging
generally, and fix several bugs.
It also prepares for future tests which make heavy usage of these
refactored functions and have new requirements such as simultaneous QEMU
processes.
These patches were broken off from this prior series:
https://lore.kernel.org/all/20251021-vsock-vmtest-v7-0-0661b7b6f081@meta.co…
To: Stefano Garzarella <sgarzare(a)redhat.com>
To: Shuah Khan <shuah(a)kernel.org>
To: Jakub Kicinski <kuba(a)kernel.org>
To: Bobby Eshleman <bobbyeshleman(a)gmail.com>
Cc: virtualization(a)lists.linux.dev
Cc: netdev(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Signed-off-by: Bobby Eshleman <bobbyeshleman(a)meta.com>
---
Bobby Eshleman (12):
selftests/vsock: improve logging in vmtest.sh
selftests/vsock: make wait_for_listener() work even if pipefail is on
selftests/vsock: reuse logic for vsock_test through wrapper functions
selftests/vsock: avoid multi-VM pidfile collisions with QEMU
selftests/vsock: do not unconditionally die if qemu fails
selftests/vsock: speed up tests by reducing the QEMU pidfile timeout
selftests/vsock: add check_result() for pass/fail counting
selftests/vsock: identify and execute tests that can re-use VM
selftests/vsock: add BUILD=0 definition
selftests/vsock: avoid false-positives when checking dmesg
selftests/vsock: add 1.37 to tested virtme-ng versions
selftests/vsock: add vsock_loopback module loading
tools/testing/selftests/vsock/vmtest.sh | 345 +++++++++++++++++++++-----------
1 file changed, 227 insertions(+), 118 deletions(-)
---
base-commit: 962ac5ca99a5c3e7469215bf47572440402dfd59
change-id: 20251021-vsock-selftests-fixes-and-improvements-057440ffb2fa
Best regards,
--
Bobby Eshleman <bobbyeshleman(a)meta.com>
Currently it is not possible to disable streaming mode via ptrace on SME
only systems, the interface for doing this is to write via NT_ARM_SVE but
such writes will be rejected on a system without SVE support. Enable this
functionality by allowing userspace to write SVE_PT_REGS_FPSIMD format data
via NT_ARM_SVE with the vector length set to 0 on SME only systems. Such
writes currently error since we require that a vector length is specified
which should minimise the risk that existing software is relying on current
behaviour.
Reads are not supported since I am not aware of any use case for this and
there is some risk that an existing userspace application may be confused if
it reads NT_ARM_SVE on a system without SVE. Existing kernels will return
FPSIMD formatted register state from NT_ARM_SVE if full SVE state is not
stored, for example if the task has not used SVE. Returning a vector length
of 0 would create a risk that software could try to do things like allocate
space for register state with zero sizes, while returning a vector length of
128 bits would look like SVE is supported. It seems safer to just not make
the changes to add read support.
It remains possible for userspace to detect a SME only system via the ptrace
interface only since reads of NT_ARM_SSVE and NT_ARM_ZA will suceed while
reads of NT_ARM_SVE will fail. Read/write access to the FPSIMD registers in
non-streaming mode is available via REGSET_FPR.
The aim is is to make a minimally invasive change, no operation that would
previously have succeeded will be affected, and we use a previously defined
interface in new circumstances rather than define completely new ABI.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v2:
- Rebase onto v6.18-rc1
- Link to v1: https://lore.kernel.org/r/20250820-arm64-sme-ptrace-sme-only-v1-0-f7c22b287…
---
Mark Brown (3):
arm64/sme: Support disabling streaming mode via ptrace on SME only systems
kselftst/arm64: Test NT_ARM_SVE FPSIMD format writes on non-SVE systems
kselftest/arm64: Cover disabling streaming mode without SVE in fp-ptrace
Documentation/arch/arm64/sve.rst | 5 +++
arch/arm64/kernel/ptrace.c | 40 +++++++++++++++---
tools/testing/selftests/arm64/fp/fp-ptrace.c | 5 +--
tools/testing/selftests/arm64/fp/sve-ptrace.c | 61 +++++++++++++++++++++++++++
4 files changed, 100 insertions(+), 11 deletions(-)
---
base-commit: cb6649f6217c0331b885cf787f1d175963e2a1d2
change-id: 20250717-arm64-sme-ptrace-sme-only-1fb850600ea0
Best regards,
--
Mark Brown <broonie(a)kernel.org>
From: Benjamin Berg <benjamin.berg(a)intel.com>
This patchset is an attempt to start a nolibc port of UML. The goal is
to port UML to use nolibc in smaller chunks to make the switch more
manageable.
Using nolibc has the advantage that it is a smaller runtime and it
allows us to be in full control about all memory mappings that are done.
Another libc on the other hand might map memory unaware of UML, causing
collisions with the UML memory layout. Such mappings could even happen
before UML has fully initialized (e.g. rseq being mapped into the
physical or vmalloc memory areas).
There are three parts to this patchset:
* Two patches to use tools/include headers instead of kernel headers
for userspace files.
* A few nolibc fixes and a new NOLIBC_NO_RUNTIME compile flag for it
* Finally nolibc build support for UML and switching two files while
adding the appropriate support in nolibc itself.
v1 of this patchset was
https://lore.kernel.org/all/20250915071115.1429196-1-benjamin@sipsolutions.…
v2:
https://lore.kernel.org/all/20250919153420.727385-1-benjamin@sipsolutions.n…
Changes in v3:
- sys_ptrace is now not a varadic function
- improve printf %m implementation
- keep perror as function available with NOLIBC_IGNORE_ERRNO
- change syscall guard and fix i386 build
Changes in v2:
- add sys/uio.h and sys/ptrace.h to nolibc
- Use NOLIBC_NO_RUNTIME to disable nolibc startup code
- Fix out-of-tree build
- various small improvements and cleanups
Benjamin
Benjamin Berg (12):
tools compiler.h: fix __used definition
um: use tools/include for user files
tools/nolibc/stdio: let perror work when NOLIBC_IGNORE_ERRNO is set
tools/nolibc/dirent: avoid errno in readdir_r
tools/nolibc: implement %m if errno is not defined
tools/nolibc: use __fallthrough__ rather than fallthrough
tools/nolibc: add option to disable runtime
um: add infrastructure to build files using nolibc
um: use nolibc for the --showconfig implementation
tools/nolibc: add uio.h with readv and writev
tools/nolibc: add ptrace support
um: switch ptrace FP register access to nolibc
arch/um/Makefile | 38 ++++++++++++---
arch/um/include/shared/init.h | 2 +-
arch/um/include/shared/os.h | 2 +
arch/um/include/shared/user.h | 6 ---
arch/um/kernel/Makefile | 2 +-
arch/um/kernel/skas/stub.c | 1 +
arch/um/kernel/skas/stub_exe.c | 4 +-
arch/um/os-Linux/skas/process.c | 6 +--
arch/um/os-Linux/start_up.c | 4 +-
arch/um/scripts/Makefile.rules | 10 +++-
arch/x86/um/Makefile | 6 ++-
arch/x86/um/os-Linux/Makefile | 5 +-
arch/x86/um/os-Linux/registers.c | 20 ++------
arch/x86/um/user-offsets.c | 1 -
tools/include/linux/compiler.h | 2 +-
tools/include/nolibc/Makefile | 2 +
tools/include/nolibc/arch-arm.h | 2 +
tools/include/nolibc/arch-arm64.h | 2 +
tools/include/nolibc/arch-loongarch.h | 2 +
tools/include/nolibc/arch-m68k.h | 2 +
tools/include/nolibc/arch-mips.h | 2 +
tools/include/nolibc/arch-powerpc.h | 2 +
tools/include/nolibc/arch-riscv.h | 2 +
tools/include/nolibc/arch-s390.h | 2 +
tools/include/nolibc/arch-sh.h | 2 +
tools/include/nolibc/arch-sparc.h | 2 +
tools/include/nolibc/arch-x86.h | 4 ++
tools/include/nolibc/compiler.h | 4 +-
tools/include/nolibc/crt.h | 3 ++
tools/include/nolibc/dirent.h | 6 +--
tools/include/nolibc/nolibc.h | 2 +
tools/include/nolibc/stackprotector.h | 2 +
tools/include/nolibc/stdio.h | 10 +++-
tools/include/nolibc/stdlib.h | 2 +
tools/include/nolibc/sys.h | 3 +-
tools/include/nolibc/sys/auxv.h | 3 ++
tools/include/nolibc/sys/ptrace.h | 44 ++++++++++++++++++
tools/include/nolibc/sys/uio.h | 49 ++++++++++++++++++++
tools/testing/selftests/nolibc/nolibc-test.c | 11 +++++
39 files changed, 221 insertions(+), 53 deletions(-)
create mode 100644 tools/include/nolibc/sys/ptrace.h
create mode 100644 tools/include/nolibc/sys/uio.h
--
2.51.0
Update Rae's email address for the KUnit entry. Also add an entry to
.mailmap to map former google email to current gmail address.
Signed-off-by: Rae Moar <rmoar(a)google.com>
---
I am leaving Google and am going through and cleaning up my @google.com
address in the relevant places. Note that Friday, November 7 2025 is my
last day at Google after which I will lose access to this email account
so any future updates or comments after Friday will come from my
@gmail.com account.
.mailmap | 1 +
MAINTAINERS | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/.mailmap b/.mailmap
index d2edd256b19d..2fcf7e4a5cfd 100644
--- a/.mailmap
+++ b/.mailmap
@@ -642,6 +642,7 @@ Qais Yousef <qyousef(a)layalina.io> <qais.yousef(a)arm.com>
Quentin Monnet <qmo(a)kernel.org> <quentin.monnet(a)netronome.com>
Quentin Monnet <qmo(a)kernel.org> <quentin(a)isovalent.com>
Quentin Perret <qperret(a)qperret.net> <quentin.perret(a)arm.com>
+Rae Moar <raemoar63(a)gmail.com> <rmoar(a)google.com>
Rafael J. Wysocki <rjw(a)rjwysocki.net> <rjw(a)sisk.pl>
Rajeev Nandan <quic_rajeevny(a)quicinc.com> <rajeevny(a)codeaurora.org>
Rajendra Nayak <quic_rjendra(a)quicinc.com> <rnayak(a)codeaurora.org>
diff --git a/MAINTAINERS b/MAINTAINERS
index 46126ce2f968..eefcff990987 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13601,7 +13601,7 @@ F: fs/smb/server/
KERNEL UNIT TESTING FRAMEWORK (KUnit)
M: Brendan Higgins <brendan.higgins(a)linux.dev>
M: David Gow <davidgow(a)google.com>
-R: Rae Moar <rmoar(a)google.com>
+R: Rae Moar <raemoar63(a)gmail.com>
L: linux-kselftest(a)vger.kernel.org
L: kunit-dev(a)googlegroups.com
S: Maintained
base-commit: 9de5f847ef8fa205f4fd704a381d32ecb5b66da9
--
2.51.1.851.g4ebd6896fd-goog
From: Chia-Yu Chang <chia-yu.chang(a)nokia-bell-labs.com>
Hello,
Plesae find the v5 AccECN case handling patch series, which covers
several excpetional case handling of Accurate ECN spec (RFC9768),
adds new identifiers to be used by CC modules, adds ecn_delta into
rate_sample, and keeps the ACE counter for computation, etc.
This patch series is part of the full AccECN patch series, which is available at
https://github.com/L4STeam/linux-net-next/commits/upstream_l4steam/
Best regards,
Chia-Yu
---
v5:
- Move previous #11 in v4 in latter patch after discussion with RFC author.
- Add #3 to update the comments for SKB_GSO_TCP_ECN and SKB_GSO_TCP_ACCECN. (Parav Pandit <parav(a)nvidia.com>)
- Add gro self-test for TCP CWR flag in #4. (Eric Dumazet <edumazet(a)google.com>)
- Add fixes: tag into #7 (Paolo Abeni <pabeni(a)redhat.com>)
- Update commit message of #8 and if condition check (Paolo Abeni <pabeni(a)redhat.com>)
- Add empty line between variable declarations and code in #13 (Paolo Abeni <pabeni(a)redhat.com>)
v4:
- Add previous #13 in v2 back after dicussion with the RFC author.
- Add TCP_ACCECN_OPTION_PERSIST to tcp_ecn_option sysctl to ignore AccECN fallback policy on sending AccECN option.
v3:
- Add additional min() check if pkts_acked_ewma is not initialized in #1. (Paolo Abeni <pabeni(a)redhat.com>)
- Change TCP_CONG_WANTS_ECT_1 into individual flag add helper function INET_ECN_xmit_wants_ect_1() in #3. (Paolo Abeni <pabeni(a)redhat.com>)
- Add empty line between variable declarations and code in #4. (Paolo Abeni <pabeni(a)redhat.com>)
- Update commit message to fix old AccECN commits in #5. (Paolo Abeni <pabeni(a)redhat.com>)
- Remove unnecessary brackets in #10. (Paolo Abeni <pabeni(a)redhat.com>)
- Move patch #3 in v2 to a later Prague patch serise and remove patch #13 in v2. (Paolo Abeni <pabeni(a)redhat.com>)
---
Chia-Yu Chang (12):
net: update commnets for SKB_GSO_TCP_ECN and SKB_GSO_TCP_ACCECN
selftests/net: gro: add self-test for TCP CWR flag
tcp: L4S ECT(1) identifier and NEEDS_ACCECN for CC modules
tcp: disable RFC3168 fallback identifier for CC modules
tcp: accecn: handle unexpected AccECN negotiation feedback
tcp: accecn: retransmit downgraded SYN in AccECN negotiation
tcp: move increment of num_retrans
tcp: accecn: retransmit SYN/ACK without AccECN option or non-AccECN
SYN/ACK
tcp: accecn: unset ECT if receive or send ACE=0 in AccECN negotiaion
tcp: accecn: fallback outgoing half link to non-AccECN
tcp: accecn: detect loss ACK w/ AccECN option and add
TCP_ACCECN_OPTION_PERSIST
tcp: accecn: enable AccECN
Ilpo Järvinen (2):
tcp: try to avoid safer when ACKs are thinned
gro: flushing when CWR is set negatively affects AccECN
Documentation/networking/ip-sysctl.rst | 4 +-
.../networking/net_cachelines/tcp_sock.rst | 1 +
include/linux/skbuff.h | 13 ++-
include/linux/tcp.h | 4 +-
include/net/inet_ecn.h | 20 +++-
include/net/tcp.h | 32 ++++++-
include/net/tcp_ecn.h | 92 ++++++++++++++-----
net/ipv4/sysctl_net_ipv4.c | 4 +-
net/ipv4/tcp.c | 2 +
net/ipv4/tcp_cong.c | 10 +-
net/ipv4/tcp_input.c | 37 +++++++-
net/ipv4/tcp_minisocks.c | 40 +++++---
net/ipv4/tcp_offload.c | 3 +-
net/ipv4/tcp_output.c | 42 ++++++---
tools/testing/selftests/net/gro.c | 80 +++++++++++-----
15 files changed, 294 insertions(+), 90 deletions(-)
--
2.34.1
Hi all,
This series refactors the VMA count limit code to improve clarity,
test coverage, and observability.
The VMA count limit, controlled by sysctl_max_map_count, is a safeguard
that prevents a single process from consuming excessive kernel memory
by creating too many memory mappings.
A major change since v3 is the first patch in the series which instead of
attempting to fix overshooting the limit now documents that this is the
intended behavior. As Hugh pointed out, the lenient check (>) in do_mmap()
and do_brk_flags() is intentional to allow for potential VMA merges or
expansions when the process is at the sysctl_max_map_count limit.
The consensus is that this historical behavior is correct but non-obvious.
This series now focuses on making that behavior clear and the surrounding
code more robust. Based on feedback from Lorenzo and David, this series
retains the helper function and the rename of map_count.
The refined v4 series is now structured as follows:
1. Documents the lenient VMA count checks with comments to clarify
their purpose.
2. Adds a comprehensive selftest to codify the expected behavior at the
limit, including the lenient mmap case.
3. Introduces max_vma_count() to abstract the max map count sysctl,
making the sysctl static and converting all callers to use the new
helper.
4. Renames mm_struct->map_count to the more explicit vma_count for
better code clarity.
5. Adds a tracepoint for observability when a process fails to
allocate a VMA due to the count limit.
Tested on x86_64 and arm64:
1. Build test:
allyesconfig for rename
2. Selftests:
cd tools/testing/selftests/mm && \
make && \
./run_vmtests.sh -t max_vma_count
3. vma tests:
cd tools/testing/vma && \
make && \
./vma
Link to v3:
https://lore.kernel.org/r/20251013235259.589015-1-kaleshsingh@google.com/
Thanks to everyone for the valuable discussion on previous revisions.
-- Kalesh
Kalesh Singh (5):
mm: Document lenient map_count checks
mm/selftests: add max_vma_count tests
mm: Introduce max_vma_count() to abstract the max map count sysctl
mm: rename mm_struct::map_count to vma_count
mm/tracing: introduce trace_mm_insufficient_vma_slots event
MAINTAINERS | 2 +
fs/binfmt_elf.c | 2 +-
fs/coredump.c | 2 +-
include/linux/mm.h | 2 -
include/linux/mm_types.h | 2 +-
include/trace/events/vma.h | 32 +
kernel/fork.c | 2 +-
mm/debug.c | 2 +-
mm/internal.h | 3 +
mm/mmap.c | 25 +-
mm/mremap.c | 13 +-
mm/nommu.c | 8 +-
mm/util.c | 1 -
mm/vma.c | 42 +-
mm/vma_internal.h | 2 +
tools/testing/selftests/mm/.gitignore | 1 +
tools/testing/selftests/mm/Makefile | 1 +
.../selftests/mm/max_vma_count_tests.c | 716 ++++++++++++++++++
tools/testing/selftests/mm/run_vmtests.sh | 5 +
tools/testing/vma/vma.c | 32 +-
tools/testing/vma/vma_internal.h | 13 +-
21 files changed, 856 insertions(+), 52 deletions(-)
create mode 100644 include/trace/events/vma.h
create mode 100644 tools/testing/selftests/mm/max_vma_count_tests.c
base-commit: b227c04932039bccc21a0a89cd6df50fa57e4716
--
2.51.1.851.g4ebd6896fd-goog
Overall, we encountered a warning [1] that can be triggered by running the
selftest I provided.
MPTCP creates subflows for data transmission between two endpoints.
However, BPF can use sockops to perform additional operations when TCP
completes the three-way handshake. The issue arose because we used sockmap
in sockops, which replaces sk->sk_prot and some handlers. Since subflows
also have their own specialized handlers, this creates a conflict and leads
to traffic failure. Therefore, we need to reject operations targeting
subflows.
This patchset simply prevents the combination of subflows and sockmap
without changing any functionality.
A complete integration of MPTCP and sockmap would require more effort, for
example, we would need to retrieve the parent socket from subflows in
sockmap and implement handlers like read_skb.
If maintainers don't object, we can further improve this in subsequent
work.
[1] truncated warning:
[ 18.234652] ------------[ cut here ]------------
[ 18.234664] WARNING: CPU: 1 PID: 388 at net/mptcp/protocol.c:68 mptcp_stream_accept+0x34c/0x380
[ 18.234726] Modules linked in:
[ 18.234755] RIP: 0010:mptcp_stream_accept+0x34c/0x380
[ 18.234762] RSP: 0018:ffffc90000cf3cf8 EFLAGS: 00010202
[ 18.234800] PKRU: 55555554
[ 18.234806] Call Trace:
[ 18.234810] <TASK>
[ 18.234837] do_accept+0xeb/0x190
[ 18.234861] ? __x64_sys_pselect6+0x61/0x80
[ 18.234898] ? _raw_spin_unlock+0x12/0x30
[ 18.234915] ? alloc_fd+0x11e/0x190
[ 18.234925] __sys_accept4+0x8c/0x100
[ 18.234930] __x64_sys_accept+0x1f/0x30
[ 18.234933] x64_sys_call+0x202f/0x20f0
[ 18.234966] do_syscall_64+0x72/0x9a0
[ 18.234979] ? switch_fpu_return+0x60/0xf0
[ 18.234993] ? irqentry_exit_to_user_mode+0xdb/0x1e0
[ 18.235002] ? irqentry_exit+0x3f/0x50
[ 18.235005] ? clear_bhb_loop+0x50/0xa0
[ 18.235022] ? clear_bhb_loop+0x50/0xa0
[ 18.235025] ? clear_bhb_loop+0x50/0xa0
[ 18.235028] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 18.235066] </TASK>
[ 18.235109] ---[ end trace 0000000000000000 ]---
---
v2: https://lore.kernel.org/bpf/20251020060503.325369-1-jiayuan.chen@linux.dev/…
Some advice suggested by Jakub Sitnicki
v1: https://lore.kernel.org/mptcp/a0a2b87119a06c5ffaa51427a0964a05534fe6f1@linu…
Some advice from Matthieu Baerts.
Jiayuan Chen (3):
net,mptcp: fix proto fallback detection with BPF sockmap
bpf,sockmap: disallow MPTCP sockets from sockmap
selftests/bpf: Add mptcp test with sockmap
net/core/sock_map.c | 27 ++++
net/mptcp/protocol.c | 9 +-
.../testing/selftests/bpf/prog_tests/mptcp.c | 150 ++++++++++++++++++
.../selftests/bpf/progs/mptcp_sockmap.c | 43 +++++
4 files changed, 227 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/selftests/bpf/progs/mptcp_sockmap.c
--
2.43.0
Some network selftests defined variable-sized types defined at the end of
struct causing -Wgnu-variable-sized-type-not-at-end warning.
warning:
timestamping.c:285:18: warning: field 'cm' with variable sized type
'struct cmsghdr' not at the end of a struct or class is a GNU
extension [-Wgnu-variable-sized-type-not-at-end]
285 | struct cmsghdr cm;
| ^
ipsec.c:835:5: warning: field 'u' with variable sized type 'union
(unnamed union at ipsec.c:831:3)' not at the end of a struct or class
is a GNU extension [-Wgnu-variable-sized-type-not-at-end]
835 | } u;
| ^
This patch move these field at the end of struct to fix these warnings.
Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux(a)gmail.com>
---
tools/testing/selftests/net/ipsec.c | 2 +-
tools/testing/selftests/net/timestamping.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c
index 0ccf484b1d9d..36083c8f884f 100644
--- a/tools/testing/selftests/net/ipsec.c
+++ b/tools/testing/selftests/net/ipsec.c
@@ -828,12 +828,12 @@ static int xfrm_state_pack_algo(struct nlmsghdr *nh, size_t req_sz,
struct xfrm_desc *desc)
{
struct {
+ char buf[XFRM_ALGO_KEY_BUF_SIZE];
union {
struct xfrm_algo alg;
struct xfrm_algo_aead aead;
struct xfrm_algo_auth auth;
} u;
- char buf[XFRM_ALGO_KEY_BUF_SIZE];
} alg = {};
size_t alen, elen, clen, aelen;
unsigned short type;
diff --git a/tools/testing/selftests/net/timestamping.c b/tools/testing/selftests/net/timestamping.c
index 044bc0e9ed81..ad2be2143698 100644
--- a/tools/testing/selftests/net/timestamping.c
+++ b/tools/testing/selftests/net/timestamping.c
@@ -282,8 +282,8 @@ static void recvpacket(int sock, int recvmsg_flags,
struct iovec entry;
struct sockaddr_in from_addr;
struct {
- struct cmsghdr cm;
char control[512];
+ struct cmsghdr cm;
} control;
int res;
--
2.51.0
This small patchset is about avoid verifier bug warning when tnum_overlap()
is called with zero mask intersection.
v2:
- fix runtime error
v1:
https://lore.kernel.org/all/20251026163806.3300636-1-kafai.wan@linux.dev/
---
KaFai Wan (2):
bpf: Fix tnum_overlap to check for zero mask intersection
selftests/bpf: Range analysis test case for JEQ
kernel/bpf/tnum.c | 2 ++
.../selftests/bpf/progs/verifier_bounds.c | 23 +++++++++++++++++++
2 files changed, 25 insertions(+)
--
2.43.0
Socket APIs like recvfrom(), accept(), and getsockname() expect socklen_t*
arg, but tests were using int variables. This causes -Wpointer-sign
warnings on platforms where socklen_t is unsigned.
Change the variable type from int to socklen_t to resolve the warning and
ensure type safety across platforms.
warning fixed:
sctp_collision.c:62:70: warning: passing 'int *' to parameter of
type 'socklen_t *' (aka 'unsigned int *') converts between pointers to
integer types with different sign [-Wpointer-sign]
62 | ret = recvfrom(sd, buf, sizeof(buf),
0, (struct sockaddr *)&daddr, &len);
| ^~~~
/usr/include/sys/socket.h:165:27: note: passing argument to
parameter '__addr_len' here
165 | socklen_t *__restrict __addr_len);
| ^
Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux(a)gmail.com>
---
tools/testing/selftests/net/netfilter/sctp_collision.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/net/netfilter/sctp_collision.c b/tools/testing/selftests/net/netfilter/sctp_collision.c
index 21bb1cfd8a85..91df996367e9 100644
--- a/tools/testing/selftests/net/netfilter/sctp_collision.c
+++ b/tools/testing/selftests/net/netfilter/sctp_collision.c
@@ -9,7 +9,8 @@
int main(int argc, char *argv[])
{
struct sockaddr_in saddr = {}, daddr = {};
- int sd, ret, len = sizeof(daddr);
+ int sd, ret;
+ socklen_t len = sizeof(daddr);
struct timeval tv = {25, 0};
char buf[] = "hello";
--
2.51.0
From: Steven Rostedt <rostedt(a)goodmis.org>
The tracing selftest "event-filter-function.tc" was failing because it
first runs the "sample_events" function that triggers the kmem_cache_free
event and it looks at what function was used during a call to "ls".
But the first time it calls this, it could trigger events that are used to
pull pages into the page cache.
The rest of the test uses the function it finds during that call to see if
it will be called in subsequent "sample_events" calls. But if there's no
need to pull pages into the page cache, it will not trigger that function
and the test will fail.
Call the "sample_events" twice to trigger all the page cache work before
it calls it to find a function to use in subsequent checks.
Cc: stable(a)vger.kernel.org
Fixes: eb50d0f250e96 ("selftests/ftrace: Choose target function for filter test from samples")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
.../selftests/ftrace/test.d/filter/event-filter-function.tc | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
index c62165fabd0c..003f612f57b0 100644
--- a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
+++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
@@ -20,6 +20,10 @@ sample_events() {
echo 0 > tracing_on
echo 0 > events/enable
+# Clear functions caused by page cache; run sample_events twice
+sample_events
+sample_events
+
echo "Get the most frequently calling function"
echo > trace
sample_events
--
2.51.0
This is a follow up patch for commit 495d2d8133fd("selftests/bpf: Attempt
to build BPF programs with -Wsign-compare") from Alexei Starovoitov[1]
to be able to enable -Wsign-compare C compilation flag for clang since
-Wall doesn't add it and BPF programs are built with clang.This has the
benefit to catch problematic comparisons in future tests as quoted from
the commit message:"
int i = -1;
unsigned int j = 1;
if (i < j) // this is false.
long i = -1;
unsigned int j = 1;
if (i < j) // this is true.
C standard for reference:
- If either operand is unsigned long the other shall be converted to
unsigned long.
- Otherwise, if one operand is a long int and the other unsigned int,
then if a long int can represent all the values of an unsigned int,
the unsigned int shall be converted to a long int;
otherwise both operands shall be converted to unsigned long int.
- Otherwise, if either operand is long, the other shall be
converted to long.
- Otherwise, if either operand is unsigned, the other shall be
converted to unsigned.
Unfortunately clang's -Wsign-compare is very noisy.
It complains about (s32)a == (u32)b which is safe and doen't
have surprising behavior."
This specific patch supresses the following warnings when
-Wsign-compare is enabled:
1 warning generated.
progs/bpf_iter_bpf_percpu_array_map.c:35:16: warning: comparison of
integers of different signs: 'int' and 'const volatile __u32'
(aka 'const volatile unsigned int') [-Wsign-compare]
35 | for (i = 0; i < num_cpus; i++) {
| ~ ^ ~~~~~~~~
1 warning generated.
progs/bpf_qdisc_fifo.c:93:2: warning: comparison of integers of
different signs: 'int' and '__u32'
(aka 'unsigned int') [-Wsign-compare]
93 | bpf_for(i, 0, sch->q.qlen) {
| ^ ~ ~~~~~~~~~~~
Should be noted that many more similar changes are still needed in order
to be able to enable the -Wsign-compare flag since -Werror is enabled and
would cause compilation of bpf selftests to fail.
[1].
Link:https://github.com/torvalds/linux/commit/495d2d8133fd1407519170a5238f4…
Signed-off-by: Mehdi Ben Hadj Khelifa <mehdi.benhadjkhelifa(a)gmail.com>
---
Changelog:
Changes from v3:
-Downsized the patch as suggested by vivek yadav[2].
-Changed the commit message as suggested by Daniel Borkmann[3].
Link:https://lore.kernel.org/all/20250925103559.14876-1-mehdi.benhadjkhelif…
Changes from v2:
-Split up the patch into a patch series as suggested by vivek
-Include only changes to variable types with no casting by my mentor
david
-Removed the -Wsign-compare in Makefile to avoid compilation errors
until adding casting for rest of comparisons.
Link:https://lore.kernel.org/bpf/20250924195731.6374-1-mehdi.benhadjkhelifa…
Changes from v1:
- Fix CI failed builds where it failed due to do missing .c and
.h files in my patch for working in mainline.
Link:https://lore.kernel.org/bpf/20250924162408.815137-1-mehdi.benhadjkheli…
[2]:https://lore.kernel.org/all/CABPSWR7_w3mxr74wCDEF=MYYuG2F_vMJeD-dqotc8MD…
[3]:https://lore.kernel.org/all/5ad26663-a3cc-4bf4-9d6f-8213ac8e8ce6@iogearb…
.../testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c | 2 +-
tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c
index 9fdea8cd4c6f..0baf00463f35 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c
@@ -24,7 +24,7 @@ int dump_bpf_percpu_array_map(struct bpf_iter__bpf_map_elem *ctx)
__u32 *key = ctx->key;
void *pptr = ctx->value;
__u32 step;
- int i;
+ __u32 i;
if (key == (void *)0 || pptr == (void *)0)
return 0;
diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c
index 1de2be3e370b..7a639dcb23a9 100644
--- a/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c
+++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c
@@ -88,7 +88,7 @@ void BPF_PROG(bpf_fifo_reset, struct Qdisc *sch)
{
struct bpf_list_node *node;
struct skb_node *skbn;
- int i;
+ __u32 i;
bpf_for(i, 0, sch->q.qlen) {
struct sk_buff *skb = NULL;
--
2.51.1.dirty
This small patchset is about avoid verifier bug warning when conditional
jumps on same register when the register holds a scalar with range.
v2:
- Enhance is_branch_taken() and is_scalar_branch_taken() to handle
branch direction computation for same register. (Eduard and Alexei)
- Update the selftest.
v1:
https://lore.kernel.org/bpf/20251022164457.1203756-1-kafai.wan@linux.dev/
---
KaFai Wan (2):
bpf: Skip bounds adjustment for conditional jumps on same register
selftests/bpf: Add test for BPF_JGT on same register
kernel/bpf/verifier.c | 32 +++++++++++++++++++
.../selftests/bpf/progs/verifier_bounds.c | 18 +++++++++++
2 files changed, 50 insertions(+)
--
2.43.0
From: Jack Thomson <jackabt(a)amazon.com>
This patch series adds ARM64 support for the KVM_PRE_FAULT_MEMORY
feature, which was previously only available on x86 [1]. This allows us
to reduce the number of stage-2 faults during execution. This is of
benefit in post-copy migration scenarios, particularly in memory
intensive applications, where we are experiencing high latencies due to
the stage-2 faults.
Patch Overview:
- The first patch adds support for the KVM_PRE_FAULT_MEMORY ioctl
on arm64.
- The second patch fixes an issue with unaligned mmap allocations
in the selftests.
- The third patch updates the pre_fault_memory_test to support
arm64.
- The last patch extends the pre_fault_memory_test to cover
different vm memory backings.
=== Changes Since v1 [2] ===
Addressing feedback from Oliver:
- No pre-fault flag is passed to user_mem_abort() or gmem_abort() now
aborts are synthesized.
- Remove retry loop from kvm_arch_vcpu_pre_fault_memory()
[1]: https://lore.kernel.org/kvm/20240710174031.312055-1-pbonzini@redhat.com
[2]: https://lore.kernel.org/all/20250911134648.58945-1-jackabt.amazon@gmail.com
Jack Thomson (4):
KVM: arm64: Add pre_fault_memory implementation
KVM: selftests: Fix unaligned mmap allocations
KVM: selftests: Enable pre_fault_memory_test for arm64
KVM: selftests: Add option for different backing in pre-fault tests
Documentation/virt/kvm/api.rst | 3 +-
arch/arm64/kvm/Kconfig | 1 +
arch/arm64/kvm/arm.c | 1 +
arch/arm64/kvm/mmu.c | 73 +++++++++++-
tools/testing/selftests/kvm/Makefile.kvm | 1 +
tools/testing/selftests/kvm/lib/kvm_util.c | 12 +-
.../selftests/kvm/pre_fault_memory_test.c | 110 +++++++++++++-----
7 files changed, 163 insertions(+), 38 deletions(-)
base-commit: 42188667be387867d2bf763d028654cbad046f7b
--
2.43.0
From: Wilfred Mallawa <wilfred.mallawa(a)wdc.com>
During a handshake, an endpoint may specify a maximum record size limit.
Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
maximum record size. Meaning that, the outgoing records from the kernel
can exceed a lower size negotiated during the handshake. In such a case,
the TLS endpoint must send a fatal "record_overflow" alert [1], and
thus the record is discarded.
Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
support. For these devices, supporting TLS record size negotiation is
necessary because the maximum TLS record size supported by the controller
is less than the default 16KB currently used by the kernel.
Currently, there is no way to inform the kernel of such a limit. This patch
adds support to a new setsockopt() option `TLS_TX_MAX_PAYLOAD_LEN` that
allows for setting the maximum plaintext fragment size. Once set, outgoing
records are no larger than the size specified. This option can be used to
specify the record size limit.
[1] https://www.rfc-editor.org/rfc/rfc8449
Signed-off-by: Wilfred Mallawa <wilfred.mallawa(a)wdc.com>
---
V7 -> V8:
- Fixup HTML doc indentation
- Drop the getsockopt() change in V7 where ContentType was included in the
max payload length
---
Documentation/networking/tls.rst | 20 ++++++++++
include/net/tls.h | 3 ++
include/uapi/linux/tls.h | 2 +
net/tls/tls_device.c | 2 +-
net/tls/tls_main.c | 64 ++++++++++++++++++++++++++++++++
net/tls/tls_sw.c | 2 +-
6 files changed, 91 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index 36cc7afc2527..980c442d7161 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -280,6 +280,26 @@ If the record decrypted turns out to had been padded or is not a data
record it will be decrypted again into a kernel buffer without zero copy.
Such events are counted in the ``TlsDecryptRetry`` statistic.
+TLS_TX_MAX_PAYLOAD_LEN
+~~~~~~~~~~~~~~~~~~~~~~
+
+Specifies the maximum size of the plaintext payload for transmitted TLS records.
+
+When this option is set, the kernel enforces the specified limit on all outgoing
+TLS records. No plaintext fragment will exceed this size. This option can be used
+to implement the TLS Record Size Limit extension [1].
+
+* For TLS 1.2, the value corresponds directly to the record size limit.
+* For TLS 1.3, the value should be set to record_size_limit - 1, since
+ the record size limit includes one additional byte for the ContentType
+ field.
+
+The valid range for this option is 64 to 16384 bytes for TLS 1.2, and 63 to
+16384 bytes for TLS 1.3. The lower minimum for TLS 1.3 accounts for the
+extra byte used by the ContentType field.
+
+[1] https://datatracker.ietf.org/doc/html/rfc8449
+
Statistics
==========
diff --git a/include/net/tls.h b/include/net/tls.h
index 857340338b69..f2af113728aa 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -53,6 +53,8 @@ struct tls_rec;
/* Maximum data size carried in a TLS record */
#define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14)
+/* Minimum record size limit as per RFC8449 */
+#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6)
#define TLS_HEADER_SIZE 5
#define TLS_NONCE_OFFSET TLS_HEADER_SIZE
@@ -226,6 +228,7 @@ struct tls_context {
u8 rx_conf:3;
u8 zerocopy_sendfile:1;
u8 rx_no_pad:1;
+ u16 tx_max_payload_len;
int (*push_pending_record)(struct sock *sk, int flags);
void (*sk_write_space)(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index b66a800389cc..b8b9c42f848c 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -41,6 +41,7 @@
#define TLS_RX 2 /* Set receive parameters */
#define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */
#define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */
+#define TLS_TX_MAX_PAYLOAD_LEN 5 /* Maximum plaintext size */
/* Supported versions */
#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
@@ -194,6 +195,7 @@ enum {
TLS_INFO_RXCONF,
TLS_INFO_ZC_RO_TX,
TLS_INFO_RX_NO_PAD,
+ TLS_INFO_TX_MAX_PAYLOAD_LEN,
__TLS_INFO_MAX,
};
#define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index caa2b5d24622..4d29b390aed9 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -462,7 +462,7 @@ static int tls_push_data(struct sock *sk,
/* TLS_HEADER_SIZE is not counted as part of the TLS record, and
* we need to leave room for an authentication tag.
*/
- max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
+ max_open_record_len = tls_ctx->tx_max_payload_len +
prot->prepend_size;
do {
rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 39a2ab47fe72..56ce0bc8317b 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -541,6 +541,28 @@ static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval,
return 0;
}
+static int do_tls_getsockopt_tx_payload_len(struct sock *sk, char __user *optval,
+ int __user *optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ u16 payload_len = ctx->tx_max_payload_len;
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len < sizeof(payload_len))
+ return -EINVAL;
+
+ if (put_user(sizeof(payload_len), optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, &payload_len, sizeof(payload_len)))
+ return -EFAULT;
+
+ return 0;
+}
+
static int do_tls_getsockopt(struct sock *sk, int optname,
char __user *optval, int __user *optlen)
{
@@ -560,6 +582,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_getsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_MAX_PAYLOAD_LEN:
+ rc = do_tls_getsockopt_tx_payload_len(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -809,6 +834,32 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
return rc;
}
+static int do_tls_setsockopt_tx_payload_len(struct sock *sk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx);
+ u16 value;
+ bool tls_13 = ctx->prot_info.version == TLS_1_3_VERSION;
+
+ if (sw_ctx && sw_ctx->open_rec)
+ return -EBUSY;
+
+ if (sockptr_is_null(optval) || optlen != sizeof(value))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&value, optval, sizeof(value)))
+ return -EFAULT;
+
+ if (value < TLS_MIN_RECORD_SIZE_LIM - (tls_13 ? 1 : 0) ||
+ value > TLS_MAX_PAYLOAD_SIZE)
+ return -EINVAL;
+
+ ctx->tx_max_payload_len = value;
+
+ return 0;
+}
+
static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
unsigned int optlen)
{
@@ -830,6 +881,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_MAX_PAYLOAD_LEN:
+ lock_sock(sk);
+ rc = do_tls_setsockopt_tx_payload_len(sk, optval, optlen);
+ release_sock(sk);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -1019,6 +1075,7 @@ static int tls_init(struct sock *sk)
ctx->tx_conf = TLS_BASE;
ctx->rx_conf = TLS_BASE;
+ ctx->tx_max_payload_len = TLS_MAX_PAYLOAD_SIZE;
update_sk_prot(sk, ctx);
out:
write_unlock_bh(&sk->sk_callback_lock);
@@ -1108,6 +1165,12 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
goto nla_failure;
}
+ err = nla_put_u16(skb, TLS_INFO_TX_MAX_PAYLOAD_LEN,
+ ctx->tx_max_payload_len);
+
+ if (err)
+ goto nla_failure;
+
rcu_read_unlock();
nla_nest_end(skb, start);
return 0;
@@ -1129,6 +1192,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin)
nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */
nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */
+ nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_MAX_PAYLOAD_LEN */
0;
return size;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index d17135369980..9937d4c810f2 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1079,7 +1079,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
orig_size = msg_pl->sg.size;
full_record = false;
try_to_copy = msg_data_left(msg);
- record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
+ record_room = tls_ctx->tx_max_payload_len - msg_pl->sg.size;
if (try_to_copy >= record_room) {
try_to_copy = record_room;
full_record = true;
--
2.51.0
When compiling the BPF selftests with Clang versions that do not support
the addr_space_cast builtin, the build fails with assembly errors in
"verifier_ldsx.c" [1].
The root cause is that the inline assembly using addr_space_cast is
being processed by a compiler that lacks this feature. To resolve this,
wrap the affected code sections (specifically the arena_ldsx_* test
functions) with #if defined(__BPF_FEATURE_ADDR_SPACE_CAST). This
ensures the code is only compiled when the Clang supports the necessary
feature, preventing build failures on older or incompatible compiler
versions.
This change maintains test coverage for systems with support while
allowing the tests to build successfully in all environments.
[1]:
root:tools/testing/selftests/bpf$ make
CLNG-BPF [test_progs] verifier_ldsx.bpf.o
progs/verifier_ldsx.c:322:2: error: invalid operand for instruction
322 | "r1 = %[arena] ll;"
| ^
<inline asm>:1:52: note: instantiated into assembly here
1 | r1 = arena ll;r0 = 0xdeadbeef;r0 = addr_space_cast(r0,...
| ^
Fixes: f61654912404 ("selftests: bpf: Add tests for signed loads from arena")
Signed-off-by: Jiayuan Chen <jiayuan.chen(a)linux.dev>
---
tools/testing/selftests/bpf/progs/verifier_ldsx.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c
index c8494b682c31..cefa02e417d3 100644
--- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c
+++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c
@@ -263,6 +263,7 @@ __naked void ldsx_ctx_8(void)
: __clobber_all);
}
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
SEC("syscall")
__description("Arena LDSX Disasm")
__success
@@ -425,6 +426,7 @@ __naked void arena_ldsx_s32(void *ctx)
: __clobber_all
);
}
+#endif
/* to retain debug info for BTF generation */
void kfunc_root(void)
--
2.43.0
This series is the start of adding full DMABUF support to
iommufd. Currently it is limited to only work with VFIO's DMABUF exporter.
It sits on top of Leon's series to add a DMABUF exporter to VFIO:
https://lore.kernel.org/all/cover.1760368250.git.leon@kernel.org/
The existing IOMMU_IOAS_MAP_FILE is enhanced to detect DMABUF fd's, but
otherwise works the same as it does today for a memfd. The user can select
a slice of the FD to map into the ioas and if the underliyng alignment
requirements are met it will be placed in the iommu_domain.
Though limited, it is enough to allow a VMM like QEMU to connect MMIO BAR
memory from VFIO to an iommu_domain controlled by iommufd. This is used
for PCI Peer to Peer support in VMs, and is the last feature that the VFIO
type 1 container has that iommufd couldn't do.
The VFIO type1 version extracts raw PFNs from VMAs, which has no lifetime
control and is a use-after-free security problem.
Instead iommufd relies on revokable DMABUFs. Whenever VFIO thinks there
should be no access to the MMIO it can shoot down the mapping in iommufd
which will unmap it from the iommu_domain. There is no automatic remap,
this is a safety protocol so the kernel doesn't get stuck. Userspace is
expected to know it is doing something that will revoke the dmabuf and
map/unmap it around the activity. Eg when QEMU goes to issue FLR it should
do the map/unmap to iommufd.
Since DMABUF is missing some key general features for this use case it
relies on a "private interconnect" between VFIO and iommufd via the
vfio_pci_dma_buf_iommufd_map() call.
The call confirms the DMABUF has revoke semantics and delivers a phys_addr
for the memory suitable for use with iommu_map().
Medium term there is a desire to expand the supported DMABUFs to include
GPU drivers to support DPDK/SPDK type use cases so future series will work
to add a general concept of revoke and a general negotiation of
interconnect to remove vfio_pci_dma_buf_iommufd_map().
I also plan another series to modify iommufd's vfio_compat to
transparently pull a dmabuf out of a VFIO VMA to emulate more of the uAPI
of type1.
The latest series for interconnect negotation to exchange a phys_addr is:
https://lore.kernel.org/r/20251027044712.1676175-1-vivek.kasireddy@intel.com
And the discussion for design of revoke is here:
https://lore.kernel.org/dri-devel/20250114173103.GE5556@nvidia.com/
This is on github: https://github.com/jgunthorpe/linux/commits/iommufd_dmabuf
The branch has various modifications to Leon's series I've suggested.
Jason Gunthorpe (8):
iommufd: Add DMABUF to iopt_pages
iommufd: Do not map/unmap revoked DMABUFs
iommufd: Allow a DMABUF to be revoked
iommufd: Allow MMIO pages in a batch
iommufd: Have pfn_reader process DMABUF iopt_pages
iommufd: Have iopt_map_file_pages convert the fd to a file
iommufd: Accept a DMABUF through IOMMU_IOAS_MAP_FILE
iommufd/selftest: Add some tests for the dmabuf flow
drivers/iommu/iommufd/io_pagetable.c | 74 +++-
drivers/iommu/iommufd/io_pagetable.h | 53 ++-
drivers/iommu/iommufd/ioas.c | 8 +-
drivers/iommu/iommufd/iommufd_private.h | 13 +-
drivers/iommu/iommufd/iommufd_test.h | 10 +
drivers/iommu/iommufd/main.c | 10 +
drivers/iommu/iommufd/pages.c | 407 ++++++++++++++++--
drivers/iommu/iommufd/selftest.c | 142 ++++++
tools/testing/selftests/iommu/iommufd.c | 43 ++
tools/testing/selftests/iommu/iommufd_utils.h | 44 ++
10 files changed, 741 insertions(+), 63 deletions(-)
base-commit: fc882154e421f82677925d33577226e776bb07a4
--
2.43.0
This series adds namespace support to vhost-vsock and loopback. It does
not add namespaces to any of the other guest transports (virtio-vsock,
hyperv, or vmci).
The current revision supports two modes: local and global. Local
mode is complete isolation of namespaces, while global mode is complete
sharing between namespaces of CIDs (the original behavior).
The mode is set using /proc/sys/net/vsock/ns_mode.
Modes are per-netns and write-once. This allows a system to configure
namespaces independently (some may share CIDs, others are completely
isolated). This also supports future possible mixed use cases, where
there may be namespaces in global mode spinning up VMs while there are
mixed mode namespaces that provide services to the VMs, but are not
allowed to allocate from the global CID pool (this mode not implemented
in this series).
If a socket or VM is created when a namespace is global but the
namespace changes to local, the socket or VM will continue working
normally. That is, the socket or VM assumes the mode behavior of the
namespace at the time the socket/VM was created. The original mode is
captured in vsock_create() and so occurs at the time of socket(2) and
accept(2) for sockets and open(2) on /dev/vhost-vsock for VMs. This
prevents a socket/VM connection from suddenly breaking due to a
namespace mode change. Any new sockets/VMs created after the mode change
will adopt the new mode's behavior.
Additionally, added tests for the new namespace features:
tools/testing/selftests/vsock/vmtest.sh
1..30
ok 1 vm_server_host_client
ok 2 vm_client_host_server
ok 3 vm_loopback
ok 4 ns_host_vsock_ns_mode_ok
ok 5 ns_host_vsock_ns_mode_write_once_ok
ok 6 ns_global_same_cid_fails
ok 7 ns_local_same_cid_ok
ok 8 ns_global_local_same_cid_ok
ok 9 ns_local_global_same_cid_ok
ok 10 ns_diff_global_host_connect_to_global_vm_ok
ok 11 ns_diff_global_host_connect_to_local_vm_fails
ok 12 ns_diff_global_vm_connect_to_global_host_ok
ok 13 ns_diff_global_vm_connect_to_local_host_fails
ok 14 ns_diff_local_host_connect_to_local_vm_fails
ok 15 ns_diff_local_vm_connect_to_local_host_fails
ok 16 ns_diff_global_to_local_loopback_local_fails
ok 17 ns_diff_local_to_global_loopback_fails
ok 18 ns_diff_local_to_local_loopback_fails
ok 19 ns_diff_global_to_global_loopback_ok
ok 20 ns_same_local_loopback_ok
ok 21 ns_same_local_host_connect_to_local_vm_ok
ok 22 ns_same_local_vm_connect_to_local_host_ok
ok 23 ns_mode_change_connection_continue_vm_ok
ok 24 ns_mode_change_connection_continue_host_ok
ok 25 ns_mode_change_connection_continue_both_ok
ok 26 ns_delete_vm_ok
ok 27 ns_delete_host_ok
ok 28 ns_delete_both_ok
ok 29 ns_loopback_global_global_late_module_load_ok
ok 30 ns_loopback_local_local_late_module_load_fails
SUMMARY: PASS=30 SKIP=0 FAIL=0
Dependent on series:
https://lore.kernel.org/all/20251022-vsock-selftests-fixes-and-improvements…
Thanks again for everyone's help and reviews!
Signed-off-by: Bobby Eshleman <bobbyeshleman(a)gmail.com>
To: Stefano Garzarella <sgarzare(a)redhat.com>
To: Shuah Khan <shuah(a)kernel.org>
To: David S. Miller <davem(a)davemloft.net>
To: Eric Dumazet <edumazet(a)google.com>
To: Jakub Kicinski <kuba(a)kernel.org>
To: Paolo Abeni <pabeni(a)redhat.com>
To: Simon Horman <horms(a)kernel.org>
To: Stefan Hajnoczi <stefanha(a)redhat.com>
To: Michael S. Tsirkin <mst(a)redhat.com>
To: Jason Wang <jasowang(a)redhat.com>
To: Xuan Zhuo <xuanzhuo(a)linux.alibaba.com>
To: Eugenio Pérez <eperezma(a)redhat.com>
To: K. Y. Srinivasan <kys(a)microsoft.com>
To: Haiyang Zhang <haiyangz(a)microsoft.com>
To: Wei Liu <wei.liu(a)kernel.org>
To: Dexuan Cui <decui(a)microsoft.com>
To: Bryan Tan <bryan-bt.tan(a)broadcom.com>
To: Vishnu Dasa <vishnu.dasa(a)broadcom.com>
To: Broadcom internal kernel review list <bcm-kernel-feedback-list(a)broadcom.com>
Cc: virtualization(a)lists.linux.dev
Cc: netdev(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: kvm(a)vger.kernel.org
Cc: linux-hyperv(a)vger.kernel.org
Cc: berrange(a)redhat.com
Changes in v8:
- Break generic cleanup/refactoring patches into standalone series,
remove those from this series
- Link to dependency: https://lore.kernel.org/all/20251022-vsock-selftests-fixes-and-improvements…
- Link to v7: https://lore.kernel.org/r/20251021-vsock-vmtest-v7-0-0661b7b6f081@meta.com
Changes in v7:
- fix hv_sock build
- break out vmtest patches into distinct, more well-scoped patches
- change `orig_net_mode` to `net_mode`
- many fixes and style changes in per-patch change sets (see individual
patches for specific changes)
- optimize `virtio_vsock_skb_cb` layout
- update commit messages with more useful descriptions
- vsock_loopback: use orig_net_mode instead of current net mode
- add tests for edge cases (ns deletion, mode changing, loopback module
load ordering)
- Link to v6: https://lore.kernel.org/r/20250916-vsock-vmtest-v6-0-064d2eb0c89d@meta.com
Changes in v6:
- define behavior when mode changes to local while socket/VM is alive
- af_vsock: clarify description of CID behavior
- af_vsock: use stronger langauge around CID rules (dont use "may")
- af_vsock: improve naming of buf/buffer
- af_vsock: improve string length checking on proc writes
- vsock_loopback: add space in struct to clarify lock protection
- vsock_loopback: do proper cleanup/unregister on vsock_loopback_exit()
- vsock_loopback: use virtio_vsock_skb_net() instead of sock_net()
- vsock_loopback: set loopback to NULL after kfree()
- vsock_loopback: use pernet_operations and remove callback mechanism
- vsock_loopback: add macros for "global" and "local"
- vsock_loopback: fix length checking
- vmtest.sh: check for namespace support in vmtest.sh
- Link to v5: https://lore.kernel.org/r/20250827-vsock-vmtest-v5-0-0ba580bede5b@meta.com
Changes in v5:
- /proc/net/vsock_ns_mode -> /proc/sys/net/vsock/ns_mode
- vsock_global_net -> vsock_global_dummy_net
- fix netns lookup in vhost_vsock to respect pid namespaces
- add callbacks for vsock_loopback to avoid circular dependency
- vmtest.sh loads vsock_loopback module
- remove vsock_net_mode_can_set()
- change vsock_net_write_mode() to return true/false based on success
- make vsock_net_mode enum instead of u8
- Link to v4: https://lore.kernel.org/r/20250805-vsock-vmtest-v4-0-059ec51ab111@meta.com
Changes in v4:
- removed RFC tag
- implemented loopback support
- renamed new tests to better reflect behavior
- completed suite of tests with permutations of ns modes and vsock_test
as guest/host
- simplified socat bridging with unix socket instead of tcp + veth
- only use vsock_test for success case, socat for failure case (context
in commit message)
- lots of cleanup
Changes in v3:
- add notion of "modes"
- add procfs /proc/net/vsock_ns_mode
- local and global modes only
- no /dev/vhost-vsock-netns
- vmtest.sh already merged, so new patch just adds new tests for NS
- Link to v2:
https://lore.kernel.org/kvm/20250312-vsock-netns-v2-0-84bffa1aa97a@gmail.com
Changes in v2:
- only support vhost-vsock namespaces
- all g2h namespaces retain old behavior, only common API changes
impacted by vhost-vsock changes
- add /dev/vhost-vsock-netns for "opt-in"
- leave /dev/vhost-vsock to old behavior
- removed netns module param
- Link to v1:
https://lore.kernel.org/r/20200116172428.311437-1-sgarzare@redhat.com
Changes in v1:
- added 'netns' module param to vsock.ko to enable the
network namespace support (disabled by default)
- added 'vsock_net_eq()' to check the "net" assigned to a socket
only when 'netns' support is enabled
- Link to RFC: https://patchwork.ozlabs.org/cover/1202235/
---
Bobby Eshleman (14):
vsock: a per-net vsock NS mode state
vsock/virtio: pack struct virtio_vsock_skb_cb
vsock: add netns to vsock skb cb
vsock: add netns to vsock core
vsock/loopback: add netns support
vsock/virtio: add netns to virtio transport common
vhost/vsock: add netns support
selftests/vsock: add namespace helpers to vmtest.sh
selftests/vsock: prepare vm management helpers for namespaces
selftests/vsock: add tests for proc sys vsock ns_mode
selftests/vsock: add namespace tests for CID collisions
selftests/vsock: add tests for host <-> vm connectivity with namespaces
selftests/vsock: add tests for namespace deletion and mode changes
selftests/vsock: add tests for module loading order
MAINTAINERS | 1 +
drivers/vhost/vsock.c | 48 +-
include/linux/virtio_vsock.h | 47 +-
include/net/af_vsock.h | 70 ++-
include/net/net_namespace.h | 4 +
include/net/netns/vsock.h | 22 +
net/vmw_vsock/af_vsock.c | 264 +++++++-
net/vmw_vsock/virtio_transport.c | 7 +-
net/vmw_vsock/virtio_transport_common.c | 21 +-
net/vmw_vsock/vsock_loopback.c | 89 ++-
tools/testing/selftests/vsock/vmtest.sh | 1044 ++++++++++++++++++++++++++++++-
11 files changed, 1532 insertions(+), 85 deletions(-)
---
base-commit: 962ac5ca99a5c3e7469215bf47572440402dfd59
change-id: 20250325-vsock-vmtest-b3a21d2102c2
prerequisite-message-id: <20251022-vsock-selftests-fixes-and-improvements-v1-0-edeb179d6463(a)meta.com>
prerequisite-patch-id: a2eecc3851f2509ed40009a7cab6990c6d7cfff5
prerequisite-patch-id: 501db2100636b9c8fcb3b64b8b1df797ccbede85
prerequisite-patch-id: ba1a2f07398a035bc48ef72edda41888614be449
prerequisite-patch-id: fd5cc5445aca9355ce678e6d2bfa89fab8a57e61
prerequisite-patch-id: 795ab4432ffb0843e22b580374782e7e0d99b909
prerequisite-patch-id: 1499d263dc933e75366c09e045d2125ca39f7ddd
prerequisite-patch-id: f92d99bb1d35d99b063f818a19dcda999152d74c
prerequisite-patch-id: e3296f38cdba6d903e061cff2bbb3e7615e8e671
prerequisite-patch-id: bc4662b4710d302d4893f58708820fc2a0624325
prerequisite-patch-id: f8991f2e98c2661a706183fde6b35e2b8d9aedcf
prerequisite-patch-id: 44bf9ed69353586d284e5ee63d6fffa30439a698
prerequisite-patch-id: d50621bc630eeaf608bbaf260370c8dabf6326df
Best regards,
--
Bobby Eshleman <bobbyeshleman(a)meta.com>
This series fixes a race condition in netconsole's userdata handling
where concurrent message transmission could read partially updated
userdata fields, resulting in corrupted netconsole output.
The first patch adds a selftest that reproduces the race condition by
continuously sending messages while rapidly changing userdata values,
detecting any torn reads in the output.
The second patch fixes the issue by ensuring update_userdata() holds
the target_list_lock while updating both extradata_complete and
userdata_length, preventing readers from seeing inconsistent state.
This targets net tree as it fixes a bug introduced in commit df03f830d099
("net: netconsole: cache userdata formatted string in netconsole_target").
Signed-off-by: Gustavo Luiz Duarte <gustavold(a)gmail.com>
Changes in v2:
- Added testcase to Makefile.
- Reordered fix and testcase to avoid failure in CI.
- testcase: delay cleanup until child process are killed, plus shellcheck fixes.
- Link to v1: https://lore.kernel.org/all/20251020-netconsole-fix-race-v1-0-b775be30ee8a@…
---
Gustavo Luiz Duarte (2):
netconsole: Fix race condition in between reader and writer of userdata
selftests: netconsole: Add race condition test for userdata corruption
drivers/net/netconsole.c | 5 ++
tools/testing/selftests/drivers/net/Makefile | 1 +
.../selftests/drivers/net/netcons_race_userdata.sh | 87 ++++++++++++++++++++++
3 files changed, 93 insertions(+)
---
base-commit: d63f0391d6c7b75e1a847e1a26349fa8cad0004d
change-id: 20251020-netconsole-fix-race-f465f37b57ea
Best regards,
--
Gustavo Duarte <gustavold(a)meta.com>
This small patchset is about avoid verifier bug warning when tnum_overlap()
is called with zero mask.
---
KaFai Wan (2):
bpf: Fix tnum_overlap to check for zero mask first
selftests/bpf: Range analysis test case for JEQ
kernel/bpf/tnum.c | 2 ++
.../selftests/bpf/progs/verifier_bounds.c | 23 +++++++++++++++++++
2 files changed, 25 insertions(+)
--
2.43.0
Some network selftests defined variable-sized types variable at the middle
of struct causing -Wgnu-variable-sized-type-not-at-end warning.
warning:
timestamping.c:285:18: warning: field 'cm' with variable sized type
'struct cmsghdr' not at the end of a struct or class is a GNU
extension [-Wgnu-variable-sized-type-not-at-end]
285 | struct cmsghdr cm;
| ^
ipsec.c:835:5: warning: field 'u' with variable sized type 'union
(unnamed union at ipsec.c:831:3)' not at the end of a struct or class
is a GNU extension [-Wgnu-variable-sized-type-not-at-end]
835 | } u;
| ^
This patch move these field at the end of struct to fix these warnings.
Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux(a)gmail.com>
---
Changelog:
v2: https://lore.kernel.org/linux-kselftest/20251027050856.30270-1-ankitkhushwa…
- fixed typos in the commit msg.
---
tools/testing/selftests/net/ipsec.c | 2 +-
tools/testing/selftests/net/timestamping.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c
index 0ccf484b1d9d..36083c8f884f 100644
--- a/tools/testing/selftests/net/ipsec.c
+++ b/tools/testing/selftests/net/ipsec.c
@@ -828,12 +828,12 @@ static int xfrm_state_pack_algo(struct nlmsghdr *nh, size_t req_sz,
struct xfrm_desc *desc)
{
struct {
+ char buf[XFRM_ALGO_KEY_BUF_SIZE];
union {
struct xfrm_algo alg;
struct xfrm_algo_aead aead;
struct xfrm_algo_auth auth;
} u;
- char buf[XFRM_ALGO_KEY_BUF_SIZE];
} alg = {};
size_t alen, elen, clen, aelen;
unsigned short type;
diff --git a/tools/testing/selftests/net/timestamping.c b/tools/testing/selftests/net/timestamping.c
index 044bc0e9ed81..ad2be2143698 100644
--- a/tools/testing/selftests/net/timestamping.c
+++ b/tools/testing/selftests/net/timestamping.c
@@ -282,8 +282,8 @@ static void recvpacket(int sock, int recvmsg_flags,
struct iovec entry;
struct sockaddr_in from_addr;
struct {
- struct cmsghdr cm;
char control[512];
+ struct cmsghdr cm;
} control;
int res;
--
2.51.0
The following powerpc ppc6xx_defconfig build regressions noticed on the
Linux next-20251027 tag with gcc-14 and gcc-8.
* powerpc, build
- gcc-14-ppc6xx_defconfig
- gcc-8-ppc6xx_defconfig
First seen on next-20251027
Good: next-20251024
Bad: next-20251027
Regression Analysis:
- New regression? yes
- Reproducibility? yes
Build regression: next-20251027: backlight.c:59:39: error: implicit
declaration of function 'of_find_node_by_name'; did you mean
'bus_find_device_by_name'?
Build regression: next-20251027: include/linux/math.h:167:43: error:
first argument to '__builtin_choose_expr' not a constant
Build regression: next-20251027: via-pmu-backlight.c:22:20: error:
'FB_BACKLIGHT_LEVELS' undeclared here (not in a function)
Build regression: next-20251027: minmax.h:71:17: error: first argument
to '__builtin_choose_expr' not a constant
Build regression: next-20251027: compiler.h:168:17: error:
'__UNIQUE_ID_x__286' undeclared (first use in this function); did you
mean '__UNIQUE_ID_y__287'?
Reported-by: Linux Kernel Functional Testing <lkft(a)linaro.org>
## Build error
arch/powerpc/platforms/powermac/backlight.c: In function
'pmac_has_backlight_type':
arch/powerpc/platforms/powermac/backlight.c:59:39: error: implicit
declaration of function 'of_find_node_by_name'; did you mean
'bus_find_device_by_name'? [-Wimplicit-function-declaration]
59 | struct device_node* bk_node =
of_find_node_by_name(NULL, "backlight");
| ^~~~~~~~~~~~~~~~~~~~
| bus_find_device_by_name
arch/powerpc/platforms/powermac/backlight.c:59:39: error:
initialization of 'struct device_node *' from 'int' makes pointer from
integer without a cast [-Wint-conversion]
arch/powerpc/platforms/powermac/backlight.c:60:17: error: implicit
declaration of function 'of_property_match_string'
[-Wimplicit-function-declaration]
60 | int i = of_property_match_string(bk_node,
"backlight-control", type);
| ^~~~~~~~~~~~~~~~~~~~~~~~
arch/powerpc/platforms/powermac/backlight.c:62:9: error: implicit
declaration of function 'of_node_put'
[-Wimplicit-function-declaration]
62 | of_node_put(bk_node);
| ^~~~~~~~~~~
drivers/macintosh/via-pmu-backlight.c:22:20: error:
'FB_BACKLIGHT_LEVELS' undeclared here (not in a function)
22 | static u8 bl_curve[FB_BACKLIGHT_LEVELS];
| ^~~~~~~~~~~~~~~~~~~
In file included from <command-line>:
drivers/macintosh/via-pmu-backlight.c: In function 'pmu_backlight_curve_lookup':
include/linux/compiler.h:168:17: error: '__UNIQUE_ID_x__286'
undeclared (first use in this function); did you mean
'__UNIQUE_ID_y__287'?
168 | __PASTE(__UNIQUE_ID_, \
| ^~~~~~~~~~~~
drivers/macintosh/via-pmu-backlight.c:45:23: note: in expansion of macro 'max'
45 | max = max((int)bl_curve[i], max);
| ^~~
include/linux/minmax.h:71:17: error: first argument to
'__builtin_choose_expr' not a constant
71 | (typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL,
1L)))(ux) >= 0)
| ^~~~~~~~~~~~~~~~~~~~~
include/linux/compiler_types.h:577:23: note: in definition of macro
'__compiletime_assert'
577 | if (!(condition))
\
| ^~~~~~~~~
drivers/macintosh/via-pmu-backlight.c:45:23: note: in expansion of macro 'max'
45 | max = max((int)bl_curve[i], max);
| ^~~
include/linux/minmax.h:71:17: error: first argument to
'__builtin_choose_expr' not a constant
71 | (typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL,
1L)))(ux) >= 0)
| ^~~~~~~~~~~~~~~~~~~~~
include/linux/compiler_types.h:577:23: note: in definition of macro
'__compiletime_assert'
577 | if (!(condition))
\
| ^~~~~~~~~
include/linux/minmax.h:112:25: note: in expansion of macro '__careful_cmp'
112 | #define max(x, y) __careful_cmp(max, x, y)
| ^~~~~~~~~~~~~
drivers/macintosh/via-pmu-backlight.c:45:23: note: in expansion of macro 'max'
45 | max = max((int)bl_curve[i], max);
| ^~~
In file included from include/linux/kernel.h:27,
from arch/powerpc/include/asm/page.h:11,
from arch/powerpc/include/asm/thread_info.h:13,
from include/linux/thread_info.h:60,
from arch/powerpc/include/asm/ptrace.h:342,
from drivers/macintosh/via-pmu-backlight.c:11:
include/linux/math.h:162:17: error: first argument to
'__builtin_choose_expr' not a constant
162 | __builtin_choose_expr(
\
| ^~~~~~~~~~~~~~~~~~~~~
drivers/macintosh/via-pmu-backlight.c: In function
'pmu_backlight_get_level_brightness':
drivers/macintosh/via-pmu-backlight.c:63:38: error: 'FB_BACKLIGHT_MAX'
undeclared (first use in this function); did you mean 'BACKLIGHT_RAW'?
63 | pmulevel = bl_curve[level] * FB_BACKLIGHT_MAX / MAX_PMU_LEVEL;
| ^~~~~~~~~~~~~~~~
| BACKLIGHT_RAW
drivers/macintosh/via-pmu-backlight.c:58:51: warning: parameter
'level' set but not used [-Wunused-but-set-parameter]
58 | static int pmu_backlight_get_level_brightness(int level)
| ~~~~^~~~~
drivers/macintosh/via-pmu-backlight.c: In function 'pmu_backlight_init':
drivers/macintosh/via-pmu-backlight.c:144:17: error: implicit
declaration of function 'of_machine_is_compatible'
[-Wimplicit-function-declaration]
144 | of_machine_is_compatible("AAPL,3400/2400") ||
| ^~~~~~~~~~~~~~~~~~~~~~~~
drivers/macintosh/via-pmu-backlight.c: At top level:
drivers/macintosh/via-pmu-backlight.c:22:11: warning: 'bl_curve'
defined but not used [-Wunused-variable]
22 | static u8 bl_curve[FB_BACKLIGHT_LEVELS];
| ^~~~~~~~
make[5]: *** [scripts/Makefile.build:287:
drivers/macintosh/via-pmu-backlight.o] Error 1
make[5]: Target 'drivers/macintosh/' not remade because of errors.
## Source
* Kernel version: 6.18.0-rc2-next-20251027
* Git tree: https://kernel.googlesource.com/pub/scm/linux/kernel/git/next/linux-next.git
* Git describe: next-20251027
* Git commit: 8fec172c82c2b5f6f8e47ab837c1dc91ee3d1b87
* Architectures: powerpc
* Toolchains: gcc-14
* Kconfigs: defconfig
## Build
* Test log: https://storage.tuxsuite.com/public/linaro/lkft/builds/34dKrlb77LGOQQSoC8FH…
* Test details:
https://regressions.linaro.org/lkft/linux-next-master/next-20251027/build/g…
* Build plan: https://tuxapi.tuxsuite.com/v1/groups/linaro/projects/lkft/builds/34dKrlb77…
* Build link: https://storage.tuxsuite.com/public/linaro/lkft/builds/34dKrlb77LGOQQSoC8FH…
* Kernel config:
https://storage.tuxsuite.com/public/linaro/lkft/builds/34dKrlb77LGOQQSoC8FH…
--
Linaro LKFT
Hello,
IIUC this is the first independent patch series for guest_memfd's in-place
conversion series! Happy to finally bring this out on its own.
Previous versions of this feature, part of other series, are available at
[1][2][3].
Many prior discussions have led up to these main features of this series, and
these are the main points I'd like feedback on.
1. Having private/shared status stored in a maple tree (Thanks Michael for your
support of using maple trees over xarrays for performance! [4]).
2. Having a new guest_memfd ioctl (not a vm ioctl) that performs conversions.
3. Using ioctls/structs/input attribute similar to the existing vm ioctl
KVM_SET_MEMORY_ATTRIBUTES to perform conversions.
4. Storing requested attributes directly in the maple tree.
5. Using a KVM module-wide param to toggle between setting memory attributes via
vm and guest_memfd ioctls (making them mututally exclusive - a single loaded
KVM module can only do one of the two.)
6. Skipping LRU in guest_memfd folios - make guest_memfd folios not participate
in LRU to avoid LRU refcounts from interfering with conversions.
This series is based on kvm/next, followed by
+ v12 of NUMA mempolicy support patches [5]
+ 3 cleanup patches from Sean [6][7][8]
Everything is stitched together here for your convenience
https://github.com/googleprodkernel/linux-cc/commits/guest_memfd-inplace-co…
Thank you all for helping with this series!
If I missed out your comment from a previous series, it's not intentional!
Please do raise it again.
TODOs:
+ There might be an issue with memory failure handling because when guest_memfd
folios stop participating in LRU. From a preliminary analysis,
HWPoisonHandlable() is only true if PageLRU() is true. This needs further
investigation.
[1] https://lore.kernel.org/all/bd163de3118b626d1005aa88e71ef2fb72f0be0f.172600…
[2] https://lore.kernel.org/all/20250117163001.2326672-6-tabba@google.com/
[3] https://lore.kernel.org/all/b784326e9ccae6a08388f1bf39db70a2204bdc51.174726…
[4] https://lore.kernel.org/all/20250529054227.hh2f4jmyqf6igd3i@amd.com/
[5] https://lore.kernel.org/all/20251007221420.344669-1-seanjc@google.com/T/
[6] https://lore.kernel.org/all/20250924174255.2141847-1-seanjc@google.com/
[7] https://lore.kernel.org/all/20251007224515.374516-1-seanjc@google.com/
[8] https://lore.kernel.org/all/20251007223625.369939-1-seanjc@google.com/
Ackerley Tng (19):
KVM: guest_memfd: Update kvm_gmem_populate() to use gmem attributes
KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
KVM: guest_memfd: Don't set FGP_ACCESSED when getting folios
KVM: guest_memfd: Skip LRU for guest_memfd folios
KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES
KVM: selftests: Update framework to use KVM_SET_MEMORY_ATTRIBUTES2
KVM: selftests: guest_memfd: Test basic single-page conversion flow
KVM: selftests: guest_memfd: Test conversion flow when INIT_SHARED
KVM: selftests: guest_memfd: Test indexing in guest_memfd
KVM: selftests: guest_memfd: Test conversion before allocation
KVM: selftests: guest_memfd: Convert with allocated folios in
different layouts
KVM: selftests: guest_memfd: Test precision of conversion
KVM: selftests: guest_memfd: Test that truncation does not change
shared/private status
KVM: selftests: guest_memfd: Test conversion with elevated page
refcount
KVM: selftests: Reset shared memory after hole-punching
KVM: selftests: Provide function to look up guest_memfd details from
gpa
KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
KVM: selftests: Update private_mem_conversions_test to mmap()
guest_memfd
KVM: selftests: Add script to exercise private_mem_conversions_test
Sean Christopherson (18):
KVM: guest_memfd: Introduce per-gmem attributes, use to guard user
mappings
KVM: Rename KVM_GENERIC_MEMORY_ATTRIBUTES to KVM_VM_MEMORY_ATTRIBUTES
KVM: Enumerate support for PRIVATE memory iff kvm_arch_has_private_mem
is defined
KVM: Stub in ability to disable per-VM memory attribute tracking
KVM: guest_memfd: Wire up kvm_get_memory_attributes() to per-gmem
attributes
KVM: guest_memfd: Enable INIT_SHARED on guest_memfd for x86 Coco VMs
KVM: Move KVM_VM_MEMORY_ATTRIBUTES config definition to x86
KVM: Let userspace disable per-VM mem attributes, enable per-gmem
attributes
KVM: selftests: Create gmem fd before "regular" fd when adding memslot
KVM: selftests: Rename guest_memfd{,_offset} to gmem_{fd,offset}
KVM: selftests: Add support for mmap() on guest_memfd in core library
KVM: selftests: Add helpers for calling ioctls on guest_memfd
KVM: selftests: guest_memfd: Test that shared/private status is
consistent across processes
KVM: selftests: Add selftests global for guest memory attributes
capability
KVM: selftests: Provide common function to set memory attributes
KVM: selftests: Check fd/flags provided to mmap() when setting up
memslot
KVM: selftests: Update pre-fault test to work with per-guest_memfd
attributes
KVM: selftests: Update private memory exits test work with per-gmem
attributes
Documentation/virt/kvm/api.rst | 72 ++-
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 15 +-
arch/x86/kvm/mmu/mmu.c | 4 +-
arch/x86/kvm/x86.c | 13 +-
include/linux/kvm_host.h | 44 +-
include/trace/events/kvm.h | 4 +-
include/uapi/linux/kvm.h | 17 +
mm/filemap.c | 1 +
mm/memcontrol.c | 2 +
tools/testing/selftests/kvm/.gitignore | 1 +
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../kvm/guest_memfd_conversions_test.c | 498 ++++++++++++++++++
.../testing/selftests/kvm/include/kvm_util.h | 127 ++++-
.../testing/selftests/kvm/include/test_util.h | 29 +-
tools/testing/selftests/kvm/lib/kvm_util.c | 128 +++--
tools/testing/selftests/kvm/lib/test_util.c | 7 -
.../selftests/kvm/pre_fault_memory_test.c | 2 +-
.../kvm/x86/private_mem_conversions_test.c | 55 +-
.../kvm/x86/private_mem_conversions_test.py | 159 ++++++
.../kvm/x86/private_mem_kvm_exits_test.c | 36 +-
virt/kvm/Kconfig | 4 +-
virt/kvm/guest_memfd.c | 414 +++++++++++++--
virt/kvm/kvm_main.c | 104 +++-
24 files changed, 1554 insertions(+), 185 deletions(-)
create mode 100644 tools/testing/selftests/kvm/guest_memfd_conversions_test.c
create mode 100755 tools/testing/selftests/kvm/x86/private_mem_conversions_test.py
--
2.51.0.858.gf9c4a03a3a-goog
Fix compilation error in UPROBE_setup caused by pointer type mismatch
in ternary expression. The probed_uretprobe and probed_uprobe function
pointers have different type attributes (__attribute__((nocf_check))),
which causes the conditional operator to fail with:
seccomp_bpf.c:5175:74: error: pointer type mismatch in conditional
expression [-Wincompatible-pointer-types]
Cast both function pointers to 'const void *' to match the expected
parameter type of get_uprobe_offset(), resolving the type mismatch
while preserving the function selection logic.
Signed-off-by: Nirbhay Sharma <nirbhay.lkd(a)gmail.com>
---
tools/testing/selftests/seccomp/seccomp_bpf.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 874f17763536..e13ffe18ef95 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -5172,7 +5172,8 @@ FIXTURE_SETUP(UPROBE)
ASSERT_GE(bit, 0);
}
- offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe);
+ offset = get_uprobe_offset(variant->uretprobe ?
+ (const void *)probed_uretprobe : (const void *)probed_uprobe);
ASSERT_GE(offset, 0);
if (variant->uretprobe)
--
2.48.1
Changelog:
v8:
Added review-bys and addressed comments from Mike Rapoport and
Pratyush Yadav.
Added "memblock: Unpreserve memory in case of error" to handle
rollback if preserve fails half way through.
This series refactors the KHO framework to better support in-kernel
users like the upcoming LUO. The current design, which relies on a
notifier chain and debugfs for control, is too restrictive for direct
programmatic use.
The core of this rework is the removal of the notifier chain in favor of
a direct registration API. This decouples clients from the shutdown-time
finalization sequence, allowing them to manage their preserved state
more flexibly and at any time.
In support of this new model, this series also:
- Exports kho_finalize() and kho_abort() for programmatic control.
- Makes the debugfs interface optional.
- Introduces APIs to unpreserve memory and fixes a bug in the abort
path where client state was being incorrectly discarded. Note that
this is an interim step, as a more comprehensive fix is planned as
part of the stateless KHO work [1].
- Moves all KHO code into a new kernel/liveupdate/ directory to
consolidate live update components.
[1] https://lore.kernel.org/all/20251020100306.2709352-1-jasonmiu@google.com
Mike Rapoport (Microsoft) (1):
kho: drop notifiers
Pasha Tatashin (7):
kho: allow to drive kho from within kernel
kho: make debugfs interface optional
kho: add interfaces to unpreserve folios and page ranges
kho: don't unpreserve memory during abort
liveupdate: kho: move to kernel/liveupdate
liveupdate: kho: move kho debugfs directory to liveupdate
memblock: Unpreserve memory in case of error
Documentation/core-api/kho/concepts.rst | 2 +-
MAINTAINERS | 3 +-
include/linux/kexec_handover.h | 53 +-
init/Kconfig | 2 +
kernel/Kconfig.kexec | 24 -
kernel/Makefile | 3 +-
kernel/kexec_handover_internal.h | 16 -
kernel/liveupdate/Kconfig | 39 ++
kernel/liveupdate/Makefile | 5 +
kernel/{ => liveupdate}/kexec_handover.c | 508 +++++++-----------
.../{ => liveupdate}/kexec_handover_debug.c | 0
kernel/liveupdate/kexec_handover_debugfs.c | 219 ++++++++
kernel/liveupdate/kexec_handover_internal.h | 56 ++
lib/test_kho.c | 33 +-
mm/memblock.c | 82 ++-
tools/testing/selftests/kho/init.c | 2 +-
tools/testing/selftests/kho/vmtest.sh | 1 +
17 files changed, 590 insertions(+), 458 deletions(-)
delete mode 100644 kernel/kexec_handover_internal.h
create mode 100644 kernel/liveupdate/Kconfig
create mode 100644 kernel/liveupdate/Makefile
rename kernel/{ => liveupdate}/kexec_handover.c (80%)
rename kernel/{ => liveupdate}/kexec_handover_debug.c (100%)
create mode 100644 kernel/liveupdate/kexec_handover_debugfs.c
create mode 100644 kernel/liveupdate/kexec_handover_internal.h
base-commit: 72fb0170ef1f45addf726319c52a0562b6913707
--
2.51.1.821.gb6fe4d2222-goog
This patch series suggests fixes for several corner cases in the RISC-V
vector ptrace implementation:
- init vector context with proper vlenb, to avoid reading zero vlenb
by an early attached debugger
- follow gdbserver expectations and return ENODATA instead of EINVAL
if vector extension is supported but not yet activated for the
traced process
- validate input vector csr registers in ptrace, to maintain an accurate
view of the tracee's vector context across multiple halt/resume
debug cycles
For detailed description see the appropriate commit messages. A new test
suite v_ptrace is added into the tools/testing/selftests/riscv/vector
to verify some of the vector ptrace functionality and corner cases.
Previous versions:
- v2: https://lore.kernel.org/linux-riscv/20250821173957.563472-1-geomatsi@gmail.…
- v1: https://lore.kernel.org/linux-riscv/20251007115840.2320557-1-geomatsi@gmail…
Changes in v3:
Address the review comments by Andy Chiu and rework the approach:
- drop forced vector context save entirely
- perform strict validation of vector csr regs in ptrace
Changes in v2:
- add thread_info flag to allow to force vector context save
- force vector context save after vector ptrace to ensure valid vector
context in the next ptrace operations
- force vector context save on the first context switch after vector
context init to get proper vlenb
---
Ilya Mamay (1):
riscv: ptrace: return ENODATA for inactive vector extension
Sergey Matyukevich (8):
selftests: riscv: test ptrace vector interface
selftests: riscv: verify initial vector state with ptrace
riscv: vector: init vector context with proper vlenb
riscv: csr: define vector registers elements
riscv: ptrace: validate input vector csr registers
selftests: riscv: verify ptrace rejects invalid vector csr inputs
selftests: riscv: verify ptrace accepts valid vector csr values
selftests: riscv: verify syscalls discard vector context
arch/riscv/include/asm/csr.h | 11 +
arch/riscv/kernel/ptrace.c | 72 +-
arch/riscv/kernel/vector.c | 12 +-
.../testing/selftests/riscv/vector/.gitignore | 1 +
tools/testing/selftests/riscv/vector/Makefile | 5 +-
.../testing/selftests/riscv/vector/v_ptrace.c | 737 ++++++++++++++++++
6 files changed, 830 insertions(+), 8 deletions(-)
create mode 100644 tools/testing/selftests/riscv/vector/v_ptrace.c
base-commit: 72761a7e31225971d0b29d9195e0ffa986b77867
--
2.51.0
From: Nai-Chen Cheng <bleach1827(a)gmail.com>
[ Upstream commit d3f7457da7b9527a06dbcbfaf666aa51ac2eeb53 ]
The selftests 'make clean' does not clean the net/lib because it only
processes $(TARGETS) and ignores $(INSTALL_DEP_TARGETS). This leaves
compiled objects in net/lib after cleaning, requiring manual cleanup.
Include $(INSTALL_DEP_TARGETS) in clean target to ensure net/lib
dependency is properly cleaned.
Signed-off-by: Nai-Chen Cheng <bleach1827(a)gmail.com>
Reviewed-by: Simon Horman <horms(a)kernel.org>
Tested-by: Simon Horman <horms(a)kernel.org> # build-tested
Acked-by: Shuah Khan <skhan(a)linuxfoundation.org>
Link: https://patch.msgid.link/20250910-selftests-makefile-clean-v1-1-29e7f496cd8…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
LLM Generated explanations, may be completely bogus:
YES – updating `tools/testing/selftests/Makefile:319` to iterate over
`$(TARGETS) $(INSTALL_DEP_TARGETS)` brings the `clean` recipe in line
with the existing `all` and `install` loops
(`tools/testing/selftests/Makefile:211` and `:276`), so the implicit
net/lib helper that gets built for networking selftests is now removed
during `make clean`. This fixes a real annoyance where stale objects
remain under `net/lib/` after cleaning, forcing manual removal and
risking rebuild flakiness if headers or tooling change. The change is
tiny, selftests-only, and mirrors logic already exercised in other
targets, so regression risk is negligible. Just make sure the stable
branch you target already carries the dependency hook that introduced
`INSTALL_DEP_TARGETS` (commit b86761ff6374813cdf64ffd6b95ddd1813c435d8
or equivalent); older branches without that infrastructure don’t need
this patch. Natural follow-up: once backported, run `make -C
tools/testing/selftests clean` after building the net tests to confirm
the stale net/lib objects are cleared.
tools/testing/selftests/Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 030da61dbff3a..a2d8e1093b005 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -314,7 +314,7 @@ gen_tar: install
@echo "Created ${TAR_PATH}"
clean:
- @for TARGET in $(TARGETS); do \
+ @for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \
BUILD_TARGET=$$BUILD/$$TARGET; \
$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\
done;
--
2.51.0
The current implementation of the arp monitor builds a list of vlan-tags by
following the chain of net_devices above the bond. See bond_verify_device_path().
Unfortunately, with some configurations, this is not possible. One example is
when an ovs switch is configured above the bond.
This change extends the "arp_ip_target" parameter format to allow for a list of
vlan tags to be included for each arp target. This new list of tags is optional
and may be omitted to preserve the current format and process of discovering
vlans.
The new format for arp_ip_target is:
arp_ip_target ipv4-address[vlan-tag\...],...
For example:
arp_ip_target 10.0.0.1[10/20]
arp_ip_target 10.0.0.1[] (used to disable vlan discovery)
Changes since V13
Thanks for the help Paolo:
- Changed first argument of bond_option_arp_ip_target_add() to a const.
- Changed first argument of bond_arp_target_to_string to a const.
- Added compiler time check of size argument to: bond_arp_target_to_string(),
BUILD_BUG_ON(size != BOND_OPTION_STRING_MAX_SIZE);
- In bond_arp_send_all() I changed the condition for both the allocation and
the free calls to be the same to improve the clarity of the code.
- Removed extra tab in bond_fill_info().
- Updated update bond_get_size() to reflect the increased payload for the
arp_ip_target option.
- Corrected indentation and alignment in bond-arp-ip-target.sh.
Changes since V12
Fixed uninitialized variable in bond_option_arp_ip_targets_set() (patch 4)
causing a CI failure.
Changes since V11
No Change.
Changes since V10
Thanks Paolo:
- 1/7 Changed the layout of struct bond_arp_target to reduce size of the struct.
- 3/7 Fixed format 'size-num' -> 'size - num'
- 7/7 Updated selftest (bond-arp-ip-target.sh). Removed sleep 10 in check_failure_count().
Added call to tc to verify arp probes are reaching the target interface. Then I verify that
the Link Failure counts are not increasing over "time". Arp probes are sent every 100ms,
two missed probes will trigger a Link failure. A one second wait between checking counts
should be be more than sufficient. This speeds up the execution of the test.
Thanks Nikolay:
- 4/7 In bond_option_arp_ip_targets_clear() I changed the definition of empty_target to empty_target = {}.
- bond_validate_tags() now verifies input is a multiple of sizeof(struct bond_vlan_tag).
Updated VID validity check to use: !tags->vlan_id || tags->vlan_id >= VLAN_VID_MASK) as suggested.
- In bond_option_arp_ip_targets_set() removed the redundant length check of target.target_ip.
- Added kfree(target.tags) when bond_option_arp_ip_target_add() results in an error.
- Removed the caching of struct bond_vlan_tag returned by bond_verify_device_path(), Nikolay
pointed out that caching tags prevented the detection of VLAN configuration changes.
Added a kfree(tags) for tags allocated in bond_verify_device_path().
Jay, Nikolay and I had a discussion regarding locking when adding, deleting or changing vlan tags.
Jay pointed out that user supplied tags that are stashed in the bond configuration and can only be
changed via user space this can be done safely in an RCU manner as netlink always operates with RTNL
held. If user space provided tags and then replumbs things, it'll be on user space to update the tags
in a safe manor.
I was concerned about changing options on a configured bond, I found that attempting to change
a bonds configuration (using "ip set") will abort the attempt to make a change if the bond's state is
"UP" or has slaves configured. Therefor the configuration and operational side of a bond is separated.
I agree with Jay that the existing locking scheme is sufficient.
Change since V9
Fix kdoc build error.
Changes since V8:
Moved the #define BOND_MAX_VLAN_TAGS from patch 6 to patch 3.
Thanks Simon for catching the bisection break.
Changes since V7:
These changes should eliminate the CI failures I have been seeing.
1) patch 2, changed type of bond_opt_value.extra_len to size_t.
2) Patch 4, added bond_validate_tags() to validate the array of bond_vlan_tag provided by
the user.
Changes since V6:
1) I made a number of changes to fix the failure seen in the
kernel CI. I am still unable to reproduce the this failure, hopefully I
have fixed it. These change are in patch #4 to functions:
bond_option_arp_ip_targets_clear() and
bond_option_arp_ip_targets_set()
Changes since V5: Only the last 2 patches have changed since V5.
1) Fixed sparse warning in bond_fill_info().
2) Also in bond_fill_info() I resolved data.addr uninitialized when if condition is not met.
Thank you Simon for catching this. Note: The change is different that what I shared earlier.
3) Fixed shellcheck warnings in test script: Blocked source warning, Ignored specific unassigned
references and exported ALL_TESTS to resolve a reference warning.
Changes since V4:
1)Dropped changes to proc and sysfs APIs to bonding. These APIs
do not need to be updated to support new functionality. Netlink
and iproute2 have been updated to do the right thing, but the
other APIs are more or less frozen in the past.
2)Jakub reported a warning triggered in bond_info_seq_show() during
testing. I was unable to reproduce this warning or identify
it with code inspection. However, all my changes to bond_info_seq_show()
have been dropped as unnecessary (see above).
Hopefully this will resolve the issue.
3)Selftest script has been updated based on the results of shellcheck.
Two unresolved references that are not possible to resolve are all
that remain.
4)A patch was added updating bond_info_fill()
to support "ip -d show <bond-device>" command.
The inclusion of a list of vlan tags is optional. The new logic
preserves both forward and backward compatibility with the kernel
and iproute2 versions.
Changes since V3:
1) Moved the parsing of the extended arp_ip_target out of the kernel and into
userspace (ip command). A separate patch to iproute2 to follow shortly.
2) Split up the patch set to make review easier.
Please see iproute changes in a separate posting.
Thank you for your time and reviews.
Signed-off-by: David Wilder <wilder(a)us.ibm.com>
David Wilder (7):
bonding: Adding struct bond_arp_target
bonding: Adding extra_len field to struct bond_opt_value.
bonding: arp_ip_target helpers.
bonding: Processing extended arp_ip_target from user space.
bonding: Update to bond_arp_send_all() to use supplied vlan tags
bonding: Update for extended arp_ip_target format.
bonding: Selftest and documentation for the arp_ip_target parameter.
Documentation/networking/bonding.rst | 11 +
drivers/net/bonding/bond_main.c | 48 +++--
drivers/net/bonding/bond_netlink.c | 39 +++-
drivers/net/bonding/bond_options.c | 146 ++++++++++---
drivers/net/bonding/bond_procfs.c | 4 +-
drivers/net/bonding/bond_sysfs.c | 4 +-
include/net/bond_options.h | 29 ++-
include/net/bonding.h | 67 +++++-
.../selftests/drivers/net/bonding/Makefile | 1 +
.../drivers/net/bonding/bond-arp-ip-target.sh | 204 ++++++++++++++++++
10 files changed, 474 insertions(+), 79 deletions(-)
create mode 100755 tools/testing/selftests/drivers/net/bonding/bond-arp-ip-target.sh
--
2.50.1
When running parameterized tests, each test case is initialized with
kunit_init_test(). This function takes the test_case->log as a parameter
but it clears it via string_stream_clear() on each iteration.
This results in only the log from the last parameter being preserved in
the test_case->log and the results from the previous parameters are lost
from the debugfs entry.
Fix this by manually setting the param_test.log to the test_case->log
after it has been initialized. This prevents kunit_init_test() from
clearing the log on each iteration.
Fixes: 4b59300ba4d2 ("kunit: Add parent kunit for parameterized test context")
Signed-off-by: Carlos Llamas <cmllamas(a)google.com>
---
lib/kunit/test.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index bb66ea1a3eac..62eb529824c6 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -745,7 +745,8 @@ int kunit_run_tests(struct kunit_suite *suite)
.param_index = ++test.param_index,
.parent = &test,
};
- kunit_init_test(¶m_test, test_case->name, test_case->log);
+ kunit_init_test(¶m_test, test_case->name, NULL);
+ param_test.log = test_case->log;
kunit_run_case_catch_errors(suite, test_case, ¶m_test);
if (param_desc[0] == '\0') {
--
2.51.1.821.gb6fe4d2222-goog
Fix to avoid the usage of the `ret` variable uninitialized in the
following macro expansions.
It solves the following warning:
In file included from netlink-dumps.c:21:
netlink-dumps.c: In function ‘dump_extack’:
../kselftest_harness.h:788:35: warning: ‘ret’ may be used uninitialized [-Wmaybe-uninitialized]
788 | intmax_t __exp_print = (intmax_t)__exp; \
| ^~~~~~~~~~~
../kselftest_harness.h:631:9: note: in expansion of macro ‘__EXPECT’
631 | __EXPECT(expected, #expected, seen, #seen, ==, 0)
| ^~~~~~~~
netlink-dumps.c:169:9: note: in expansion of macro ‘EXPECT_EQ’
169 | EXPECT_EQ(ret, FOUND_EXTACK);
| ^~~~~~~~~
The issue can be reproduced, building the tests, with the command:
make -C tools/testing/selftests TARGETS=net
Signed-off-by: Alessandro Zanni <alessandro.zanni87(a)gmail.com>
---
Notes:
v2: applied the reverse christmas tree order
tools/testing/selftests/net/netlink-dumps.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/net/netlink-dumps.c b/tools/testing/selftests/net/netlink-dumps.c
index 7618ebe528a4..7de360c029c6 100644
--- a/tools/testing/selftests/net/netlink-dumps.c
+++ b/tools/testing/selftests/net/netlink-dumps.c
@@ -111,8 +111,8 @@ static const struct {
TEST(dump_extack)
{
+ int i, cnt, ret = 0;
int netlink_sock;
- int i, cnt, ret;
char buf[8192];
int one = 1;
ssize_t n;
--
2.43.0
For a while now we have supported file handles for pidfds. This has
proven to be very useful.
Extend the concept to cover namespaces as well. After this patchset it
is possible to encode and decode namespace file handles using the
commong name_to_handle_at() and open_by_handle_at() apis.
Namespaces file descriptors can already be derived from pidfds which
means they aren't subject to overmount protection bugs. IOW, it's
irrelevant if the caller would not have access to an appropriate
/proc/<pid>/ns/ directory as they could always just derive the namespace
based on a pidfd already.
It has the same advantage as pidfds. It's possible to reliably and for
the lifetime of the system refer to a namespace without pinning any
resources and to compare them.
Permission checking is kept simple. If the caller is located in the
namespace the file handle refers to they are able to open it otherwise
they must hold privilege over the owning namespace of the relevant
namespace.
Both the network namespace and the mount namespace already have an
associated cookie that isn't recycled and is fully exposed to userspace.
Move this into ns_common and use the same id space for all namespaces so
they can trivially and reliably be compared.
There's more coming based on the iterator infrastructure but the series
is large enough and focuses on file handles.
Extensive selftests included. I still have various other test-suites to
run but it holds up so far.
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
---
Christian Brauner (32):
pidfs: validate extensible ioctls
nsfs: validate extensible ioctls
block: use extensible_ioctl_valid()
ns: move to_ns_common() to ns_common.h
nsfs: add nsfs.h header
ns: uniformly initialize ns_common
mnt: use ns_common_init()
ipc: use ns_common_init()
cgroup: use ns_common_init()
pid: use ns_common_init()
time: use ns_common_init()
uts: use ns_common_init()
user: use ns_common_init()
net: use ns_common_init()
ns: remove ns_alloc_inum()
nstree: make iterator generic
mnt: support iterator
cgroup: support iterator
ipc: support iterator
net: support iterator
pid: support iterator
time: support iterator
userns: support iterator
uts: support iterator
ns: add to_<type>_ns() to respective headers
nsfs: add current_in_namespace()
nsfs: support file handles
nsfs: support exhaustive file handles
nsfs: add missing id retrieval support
tools: update nsfs.h uapi header
selftests/namespaces: add identifier selftests
selftests/namespaces: add file handle selftests
block/blk-integrity.c | 8 +-
fs/fhandle.c | 6 +
fs/internal.h | 1 +
fs/mount.h | 10 +-
fs/namespace.c | 156 +--
fs/nsfs.c | 266 +++-
fs/pidfs.c | 2 +-
include/linux/cgroup.h | 5 +
include/linux/exportfs.h | 6 +
include/linux/fs.h | 14 +
include/linux/ipc_namespace.h | 5 +
include/linux/ns_common.h | 29 +
include/linux/nsfs.h | 40 +
include/linux/nsproxy.h | 11 -
include/linux/nstree.h | 89 ++
include/linux/pid_namespace.h | 5 +
include/linux/proc_ns.h | 32 +-
include/linux/time_namespace.h | 9 +
include/linux/user_namespace.h | 5 +
include/linux/utsname.h | 5 +
include/net/net_namespace.h | 6 +
include/uapi/linux/fcntl.h | 1 +
include/uapi/linux/nsfs.h | 12 +-
init/main.c | 2 +
ipc/msgutil.c | 1 +
ipc/namespace.c | 12 +-
ipc/shm.c | 2 +
kernel/Makefile | 2 +-
kernel/cgroup/cgroup.c | 2 +
kernel/cgroup/namespace.c | 24 +-
kernel/nstree.c | 233 ++++
kernel/pid_namespace.c | 13 +-
kernel/time/namespace.c | 23 +-
kernel/user_namespace.c | 17 +-
kernel/utsname.c | 28 +-
net/core/net_namespace.c | 59 +-
tools/include/uapi/linux/nsfs.h | 23 +-
tools/testing/selftests/namespaces/.gitignore | 2 +
tools/testing/selftests/namespaces/Makefile | 7 +
tools/testing/selftests/namespaces/config | 7 +
.../selftests/namespaces/file_handle_test.c | 1410 ++++++++++++++++++++
tools/testing/selftests/namespaces/nsid_test.c | 986 ++++++++++++++
42 files changed, 3306 insertions(+), 270 deletions(-)
---
base-commit: 8f5ae30d69d7543eee0d70083daf4de8fe15d585
change-id: 20250905-work-namespace-c68826dda0d4
This patch adds support for the Zalasr ISA extension, which supplies the
real load acquire/store release instructions.
The specification can be found here:
https://github.com/riscv/riscv-zalasr/blob/main/chapter2.adoc
This patch seires has been tested with ltp on Qemu with Brensan's zalasr
support patch[1].
Some false positive spacing error happens during patch checking. Thus I
CCed maintainers of checkpatch.pl as well.
[1] https://lore.kernel.org/all/CAGPSXwJEdtqW=nx71oufZp64nK6tK=0rytVEcz4F-gfvCO…
v4:
- Apply acquire/release semantics to arch_atomic operations. Thanks
to Andrea.
v3:
- Apply acquire/release semantics to arch_xchg/arch_cmpxchg operations
so as to ensure FENCE.TSO ordering between operations which precede the
UNLOCK+LOCK sequence and operations which follow the sequence. Thanks
to Andrea.
- Support hwprobe of Zalasr.
- Allow Zalasr extensions for Guest/VM.
v2:
- Adjust the order of Zalasr and Zalrsc in dt-bindings. Thanks to
Conor.
Xu Lu (10):
riscv: Add ISA extension parsing for Zalasr
dt-bindings: riscv: Add Zalasr ISA extension description
riscv: hwprobe: Export Zalasr extension
riscv: Introduce Zalasr instructions
riscv: Apply Zalasr to smp_load_acquire/smp_store_release
riscv: Apply acquire/release semantics to arch_xchg/arch_cmpxchg
operations
riscv: Apply acquire/release semantics to arch_atomic operations
riscv: Remove arch specific __atomic_acquire/release_fence
RISC-V: KVM: Allow Zalasr extensions for Guest/VM
RISC-V: KVM: selftests: Add Zalasr extensions to get-reg-list test
Documentation/arch/riscv/hwprobe.rst | 5 +-
.../devicetree/bindings/riscv/extensions.yaml | 5 +
arch/riscv/include/asm/atomic.h | 70 ++++++++-
arch/riscv/include/asm/barrier.h | 91 +++++++++--
arch/riscv/include/asm/cmpxchg.h | 144 +++++++++---------
arch/riscv/include/asm/fence.h | 4 -
arch/riscv/include/asm/hwcap.h | 1 +
arch/riscv/include/asm/insn-def.h | 79 ++++++++++
arch/riscv/include/uapi/asm/hwprobe.h | 1 +
arch/riscv/include/uapi/asm/kvm.h | 1 +
arch/riscv/kernel/cpufeature.c | 1 +
arch/riscv/kernel/sys_hwprobe.c | 1 +
arch/riscv/kvm/vcpu_onereg.c | 2 +
.../selftests/kvm/riscv/get-reg-list.c | 4 +
14 files changed, 314 insertions(+), 95 deletions(-)
--
2.20.1
This small patchset is about avoid verifier bug warning when conditional
jumps on same register when the register holds a scalar with range.
---
KaFai Wan (2):
bpf: Skip bounds adjustment for conditional jumps on same register
selftests/bpf: Add test for conditional jumps on same register
kernel/bpf/verifier.c | 4 ++++
.../selftests/bpf/progs/verifier_bounds.c | 17 +++++++++++++++++
2 files changed, 21 insertions(+)
--
2.43.0
These patches are taken from the LUOv4 series [1] and address recent
comments from Pratyush.
They apply onto mm/mm-nonmm-unstable.
This series refactors the KHO framework to better support in-kernel
users like the upcoming LUO. The current design, which relies on a
notifier chain and debugfs for control, is too restrictive for direct
programmatic use.
The core of this rework is the removal of the notifier chain in favor of
a direct registration API. This decouples clients from the shutdown-time
finalization sequence, allowing them to manage their preserved state
more flexibly and at any time.
In support of this new model, this series also:
- Exports kho_finalize() and kho_abort() for programmatic control.
- Makes the debugfs interface optional.
- Introduces APIs to unpreserve memory and fixes a bug in the abort
path where client state was being incorrectly discarded. Note that
this is an interim step, as a more comprehensive fix is planned as
part of the stateless KHO work [2].
- Moves all KHO code into a new kernel/liveupdate/ directory to
consolidate live update components.
[1] https://lore.kernel.org/all/20250929010321.3462457-1-pasha.tatashin@soleen.…
[2] https://lore.kernel.org/all/20251001011941.1513050-1-jasonmiu@google.com
Mike Rapoport (Microsoft) (1):
kho: drop notifiers
Pasha Tatashin (6):
kho: allow to drive kho from within kernel
kho: make debugfs interface optional
kho: add interfaces to unpreserve folios and page ranges
kho: don't unpreserve memory during abort
liveupdate: kho: move to kernel/liveupdate
liveupdate: kho: move kho debugfs directory to liveupdate
Documentation/core-api/kho/concepts.rst | 2 +-
MAINTAINERS | 3 +-
include/linux/kexec_handover.h | 53 +-
init/Kconfig | 2 +
kernel/Kconfig.kexec | 24 -
kernel/Makefile | 3 +-
kernel/kexec_handover_internal.h | 16 -
kernel/liveupdate/Kconfig | 39 ++
kernel/liveupdate/Makefile | 5 +
kernel/{ => liveupdate}/kexec_handover.c | 513 +++++++-----------
.../{ => liveupdate}/kexec_handover_debug.c | 0
kernel/liveupdate/kexec_handover_debugfs.c | 219 ++++++++
kernel/liveupdate/kexec_handover_internal.h | 56 ++
lib/test_kho.c | 30 +-
mm/memblock.c | 62 +--
tools/testing/selftests/kho/init.c | 2 +-
tools/testing/selftests/kho/vmtest.sh | 1 +
17 files changed, 576 insertions(+), 454 deletions(-)
delete mode 100644 kernel/kexec_handover_internal.h
create mode 100644 kernel/liveupdate/Kconfig
create mode 100644 kernel/liveupdate/Makefile
rename kernel/{ => liveupdate}/kexec_handover.c (80%)
rename kernel/{ => liveupdate}/kexec_handover_debug.c (100%)
create mode 100644 kernel/liveupdate/kexec_handover_debugfs.c
create mode 100644 kernel/liveupdate/kexec_handover_internal.h
base-commit: 4d90027271cfa0d89473d1e288af52fda9a74935
--
2.51.0.915.g61a8936c21-goog
Hello,
this is the v2 of test_tc_tunnel conversion into test_progs framework.
test_tc_tunnel.sh tests a variety of tunnels based on BPF: packets are
encapsulated by a BPF program on the client egress. We then check that
those packets can be decapsulated on server ingress side, either thanks
to kernel-based or BPF-based decapsulation. Those tests are run thanks
to two veths in two dedicated namespaces.
- patches 1 and 2 are preparatory patches
- patch 3 introduce tc_tunnel test into test_progs
- patch 4 gets rid of the test_tc_tunnel.sh script
The new test has been executed both in some x86 local qemu machine, as
well as in CI:
# ./test_progs -a tc_tunnel
#454/1 tc_tunnel/ipip_none:OK
#454/2 tc_tunnel/ipip6_none:OK
#454/3 tc_tunnel/ip6tnl_none:OK
#454/4 tc_tunnel/sit_none:OK
#454/5 tc_tunnel/vxlan_eth:OK
#454/6 tc_tunnel/ip6vxlan_eth:OK
#454/7 tc_tunnel/gre_none:OK
#454/8 tc_tunnel/gre_eth:OK
#454/9 tc_tunnel/gre_mpls:OK
#454/10 tc_tunnel/ip6gre_none:OK
#454/11 tc_tunnel/ip6gre_eth:OK
#454/12 tc_tunnel/ip6gre_mpls:OK
#454/13 tc_tunnel/udp_none:OK
#454/14 tc_tunnel/udp_eth:OK
#454/15 tc_tunnel/udp_mpls:OK
#454/16 tc_tunnel/ip6udp_none:OK
#454/17 tc_tunnel/ip6udp_eth:OK
#454/18 tc_tunnel/ip6udp_mpls:OK
#454 tc_tunnel:OK
Summary: 1/18 PASSED, 0 SKIPPED, 0 FAILED
Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore(a)bootlin.com>
---
Changes in v2:
- declare a single tc_prog_attach helper rather than multiple,
intermediate helpers
- move the new helper to network_helpers.c rather than a dedicated
file
- do not rename existing tc_helpers.c/h pair (drop patch)
- keep only the minimal set of needed NS switches
- Link to v1: https://lore.kernel.org/r/20251017-tc_tunnel-v1-0-2d86808d86b2@bootlin.com
---
Alexis Lothoré (eBPF Foundation) (4):
selftests/bpf: add tc helpers
selftests/bpf: make test_tc_tunnel.bpf.c compatible with big endian platforms
selftests/bpf: integrate test_tc_tunnel.sh tests into test_progs
selftests/bpf: remove test_tc_tunnel.sh
tools/testing/selftests/bpf/Makefile | 1 -
tools/testing/selftests/bpf/network_helpers.c | 45 ++
tools/testing/selftests/bpf/network_helpers.h | 16 +
.../selftests/bpf/prog_tests/test_tc_tunnel.c | 660 +++++++++++++++++++++
.../testing/selftests/bpf/prog_tests/test_tunnel.c | 107 +---
tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 95 ++-
tools/testing/selftests/bpf/test_tc_tunnel.sh | 320 ----------
7 files changed, 776 insertions(+), 468 deletions(-)
---
base-commit: b92bbe400a50e4eb033b378252292d1cc19cabae
change-id: 20250811-tc_tunnel-c61342683f18
Best regards,
--
Alexis Lothoré, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
This path introduces several kfuncs to help BPF programs determine their
current execution context. When hooking functions for statistics, we often
need to use current->comm to get the process name.
However, these hooked functions can be called from either process context
or interrupt context. When called from interrupt context, the current we
obtain may refer to the process that was interrupted, which may not be
what we need.
These new kfuncs expose APIs that allow users to determine the actual
execution context.
Jiayuan Chen (2):
bpf: Add kfuncs for detecting execution context
selftests/bpf: Add selftests for context detection kfuncs
kernel/bpf/helpers.c | 45 +++++++++++++++++++
.../selftests/bpf/prog_tests/context.c | 32 +++++++++++++
.../selftests/bpf/progs/context_prog.c | 33 ++++++++++++++
3 files changed, 110 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/context.c
create mode 100644 tools/testing/selftests/bpf/progs/context_prog.c
--
2.43.0
This series fixes a memory corruption bug in KHO that occurs when KFENCE
is enabled.
The root cause is that KHO metadata, allocated via kzalloc(), can be
randomly serviced by kfence_alloc(). When a kernel boots via KHO, the
early memblock allocator is restricted to a "scratch area". This forces
the KFENCE pool to be allocated within this scratch area, creating a
conflict. If KHO metadata is subsequently placed in this pool, it gets
corrupted during the next kexec operation.
The series is structured in two parts:
Patch 1/2 introduces a debug-only feature (CONFIG_KEXEC_HANDOVER_DEBUG)
that adds checks to detect and fail any operation that attempts to place
KHO metadata or preserved memory within the scratch area. This serves as
a validation and diagnostic tool to confirm the problem without
affecting production builds.
Patch 2/2 provides the fix by modifying KHO to allocate its metadata
directly from the buddy allocator instead of SLUB. This bypasses the
KFENCE interception entirely.
Pasha Tatashin (2):
liveupdate: kho: warn and fail on metadata or preserved memory in
scratch area
liveupdate: kho: allocate metadata directly from the buddy allocator
kernel/liveupdate/Kconfig | 15 ++++++
kernel/liveupdate/kexec_handover.c | 51 ++++++++++++++++-----
kernel/liveupdate/kexec_handover_debug.c | 18 ++++++++
kernel/liveupdate/kexec_handover_internal.h | 9 ++++
4 files changed, 81 insertions(+), 12 deletions(-)
base-commit: 0b2f041c47acb45db82b4e847af6e17eb66cd32d
--
2.51.0.788.g6d19910ace-goog
Need to talk to a real person Spirit Airlines? Calling Spirit Airlines directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real Spirit Airlines representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach Spirit Airlines (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM Spirit Airlines on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the Spirit Airlines App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to Spirit Airlines, knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
Need to talk to a real person Southwest Airlines? Calling Southwest Airlines directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real Southwest Airlines representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach Southwest Airlines (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM Southwest Airlines on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the Southwest Airlines App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to Southwest Airlines, knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
Need to talk to a real person SkyWest Airlines? Calling SkyWest Airlines directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real SkyWest Airlines representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach SkyWest Airlines (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM SkyWest Airlines on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the SkyWest Airlines App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to SkyWest Airlines, knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
Need to talk to a real person Scandinavian Airlines (SAS)? Calling Scandinavian Airlines (SAS) directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real Scandinavian Airlines (SAS) representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach Scandinavian Airlines (SAS) (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM Scandinavian Airlines (SAS) on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the Scandinavian Airlines (SAS) App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to Scandinavian Airlines (SAS), knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
Need to talk to a real person Royal Jordanian Airlines? Calling Royal Jordanian Airlines directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real Royal Jordanian Airlines representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach Royal Jordanian Airlines (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM Royal Jordanian Airlines on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the Royal Jordanian Airlines App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to Royal Jordanian Airlines, knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
Need to talk to a real person Royal Brunei Airlines? Calling Royal Brunei Airlines directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real Royal Brunei Airlines representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach Royal Brunei Airlines (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM Royal Brunei Airlines on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the Royal Brunei Airlines App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to Royal Brunei Airlines, knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
Need to talk to a real person Porter Airlines? Calling Porter Airlines directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real Porter Airlines representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach Porter Airlines (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM Porter Airlines on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the Porter Airlines App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to Porter Airlines, knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
Need to talk to a real person Play Airlines? Calling Play Airlines directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real Play Airlines representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach Play Airlines (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM Play Airlines on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the Play Airlines App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to Play Airlines, knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
Need to talk to a real person Philippine Airlines? Calling Philippine Airlines directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real Philippine Airlines representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach Philippine Airlines (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM Philippine Airlines on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the Philippine Airlines App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to Philippine Airlines, knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
Need to talk to a real person Pegasus Airlines? Calling Pegasus Airlines directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real Pegasus Airlines representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach Pegasus Airlines (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM Pegasus Airlines on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the Pegasus Airlines App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to Pegasus Airlines, knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
Need to talk to a real person Mokulele Airlines? Calling Mokulele Airlines directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real Mokulele Airlines representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach Mokulele Airlines (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM Mokulele Airlines on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the Mokulele Airlines App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to Mokulele Airlines, knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
Need to talk to a real person Mesa Airlines? Calling Mesa Airlines directly at 📞 1-866-284-3022. Whether you are trying to make a flight change, cancel your booking, ask about baggage, or resolve a booking issue, reaching a live agent can save you time, stress, and confusion. While automated systems are useful for simple tasks, some situations just need a human touch.
In this guide, we will walk you through exactly how to reach a live person, what to prepare before calling, and alternate methods if the phone lines are busy.
☎️ First Things First: Call 1-866-284-3022
The most direct and reliable way to speak with a real Mesa Airlines representative is by calling 📞 1-866-284-3022. This is Frontier’s official customer service number and should be your go-to for:
✈️ Flight changes or cancellations
🧾 Refund or credit questions
🛄 Baggage issues
🔁 Name corrections
🛑 Check-in or boarding problems
💺 Seat selection and upgrades
Pro Tip: When calling, try to do so during non-peak hours — early mornings or late evenings — to reduce your hold time.
🎧 How to Navigate the Automated Menu
When you call 1-866-284-3022, you will first hear an automated system. To get through to a live person faster, follow these steps:
Dial 1-866-284-3022
Wait for the automated greeting to begin
Press “1” for English (or “2” for Spanish)
Press “2” for existing reservations
Press “0” to speak with an agent (you may need to press "0" more than once)
👉 If “0” does not work immediately, stay on the line. Sometimes the system transfers you to an agent after a brief wait without needing more input.
If the lines are busy and you are placed on hold, do not hang up — wait times vary, but you will eventually reach someone.
🧾 What to Have Ready Before You Call
To help the agent assist you faster, make sure you have the following details on hand:
📌 Your confirmation code or booking number
📌 The full name on the reservation
📌 Your flight date and destination
📌 Any relevant documents (ID, credit card, etc.)
📌 A notepad for writing down instructions or confirmation numbers
If you are calling to fix a mistake or request a refund, be prepared to briefly explain the issue and possibly provide documentation via email upon request.
⏰ Best Times to Call 1-866-284-3022
Customer service lines can be busy, especially during:
⚠️ Holidays
⚠️ Severe weather or flight delays
⚠️ Early morning flight hours
🎯 For the best chance at a short wait, try calling during these times:
🕔 5:00 AM – 7:00 AM (EST)
🕘 9:00 PM – 11:00 PM (EST)
📅 Midweek (Tuesdays and Wednesdays)
Avoid Mondays if possible — it is the busiest day for airlines.
🧑💻 Alternative Ways to Reach Mesa Airlines (If Phone Fails)
If calling 📞 1-866-284-3022 does not work or you are stuck in a long queue, here are a few alternate ways to get help:
💬 1. Online Chat (Limited Availability)
Visit www.flyfrontier.com
Scroll down and look for the “Let’s Chat” option.
This can connect you to a live agent or AI assistant, depending on availability.
📧 2. Email Support
You can also submit a help request through their Customer Support Form online.
Use this for non-urgent matters like refund requests or documentation review.
📱 3. Social Media
Tweet or DM Mesa Airlines on platforms like Twitter/X (@FlyFrontier) or send a message via Facebook.
Sometimes social media agents respond faster than the phone team during high-volume periods.
📲 4. Mobile App
Download the Mesa Airlines App, log in, and navigate to “My Trips” or “Support” for quick options.
While this will not guarantee a live agent, you might find answers to basic questions faster.
❗ Common Issues That Require a Live Agent
While many tasks can be done online, certain problems are best resolved with a real person at 1-866-284-3022:
🛑 Double charges or billing issues
🔄 Complex flight changes involving multiple passengers
🛄 Lost or delayed baggage
✍️ Legal name changes (marriage, divorce, etc.)
🧑⚕️ Medical or accessibility needs during travel
In these cases, avoid wasting time — call directly and ask for a live agent.
🚨 Beware of Fake Numbers and Scams
Only use the official number: 📞 1-866-284-3022.
Scammers often post fake “Frontier support” numbers online, asking for credit card info or login credentials.
🛡️ Never share your full credit card number or personal information with an unverified source.
✅ Final Thoughts
Talking to a live person at an airline should not be this hard — but when it comes to Mesa Airlines, knowing the right steps and phone number makes all the difference.
🧠 Remember:
Dial 📞 1-866-284-3022
Press 0 to reach a live agent
Call during off-peak hours
Have your booking info ready
Use alternative methods if the line is too busy
💡 The sooner you reach out, the more options you'll have to resolve your issue.
✈️ Whether you are rebooking, fixing an error, or checking a flight, calling 1-866-284-3022 connects you with someone who can truly help.
This is a follow-up series of [1]. It tries to fix a possible UAF in the
fops of cros_ec_chardev after the underlying protocol device has gone by
using revocable.
The 1st patch introduces the revocable which is an implementation of ideas
from the talk [2].
The 2nd and 3rd patches add test cases for revocable in Kunit and selftest.
The 4th patch converts existing protocol devices to resource providers
of cros_ec_device.
The 5th - 7th are PoC patches for showing the use case of "Replace file
operations" below.
---
I came out with 2 possible usages of revocable.
1. Use primitive APIs
Use the primitive APIs of revocable directly.
The file operations make sure the resources are available when using them.
This is what the series original proposed[3][4]. Even though it has the
finest grain for accessing the resources, it makes the user code verbose.
Per feedback from the community, I'm looking for some subsystem level
helpers so that user code can be simlper.
2. Replace file operations
Replace filp->f_op to revocable-aware warppers.
The warppers make sure the resources are available in the file operations.
The user code needs to provide a callback .try_access() to tell the wrappers
where/how to *save* the pointers of resources.
Known drawback:
- The warppers reserve the resources for all file operations even if they
might be unused.
- The user code still needs to be revocable-aware.
- The whole file operation becomes a SRCU read-side critical section. Are
there any functions can't be called in the critical section? If there is,
the file operations may not be awared of that.
See 5th - 7th patches for an example usage.
[1] https://lore.kernel.org/chrome-platform/20250721044456.2736300-6-tzungbi@ke…
[2] https://lpc.events/event/17/contributions/1627/
[3] https://lore.kernel.org/chrome-platform/20250912081718.3827390-5-tzungbi@ke…
[4] https://lore.kernel.org/chrome-platform/20250912081718.3827390-6-tzungbi@ke…
v5:
- Rebase onto next-20251015.
- Add more context about the PoC.
- Support multiple revocable providers in the PoC.
v4: https://lore.kernel.org/chrome-platform/20250923075302.591026-1-tzungbi@ker…
- Rebase onto next-20250922.
- Remove the 5th patch from v3.
- Add fops replacement PoC in 5th - 7th patches.
v3: https://lore.kernel.org/chrome-platform/20250912081718.3827390-1-tzungbi@ke…
- Rebase onto https://lore.kernel.org/chrome-platform/20250828083601.856083-1-tzungbi@ker…
and next-20250912.
- The 4th patch changed accordingly.
v2: https://lore.kernel.org/chrome-platform/20250820081645.847919-1-tzungbi@ker…
- Rename "ref_proxy" -> "revocable".
- Add test cases in Kunit and selftest.
v1: https://lore.kernel.org/chrome-platform/20250814091020.1302888-1-tzungbi@ke…
Tzung-Bi Shih (7):
revocable: Revocable resource management
revocable: Add Kunit test cases
selftests: revocable: Add kselftest cases
platform/chrome: Protect cros_ec_device lifecycle with revocable
revocable: Add fops replacement
char: misc: Leverage revocable fops replacement
platform/chrome: cros_ec_chardev: Secure cros_ec_device via revocable
.../driver-api/driver-model/index.rst | 1 +
.../driver-api/driver-model/revocable.rst | 87 +++++++
MAINTAINERS | 9 +
drivers/base/Kconfig | 8 +
drivers/base/Makefile | 5 +-
drivers/base/revocable.c | 233 ++++++++++++++++++
drivers/base/revocable_test.c | 110 +++++++++
drivers/char/misc.c | 8 +
drivers/platform/chrome/cros_ec.c | 5 +
drivers/platform/chrome/cros_ec_chardev.c | 22 +-
fs/Makefile | 2 +-
fs/fs_revocable.c | 154 ++++++++++++
include/linux/fs.h | 2 +
include/linux/fs_revocable.h | 21 ++
include/linux/miscdevice.h | 4 +
include/linux/platform_data/cros_ec_proto.h | 4 +
include/linux/revocable.h | 53 ++++
tools/testing/selftests/Makefile | 1 +
.../selftests/drivers/base/revocable/Makefile | 7 +
.../drivers/base/revocable/revocable_test.c | 116 +++++++++
.../drivers/base/revocable/test-revocable.sh | 39 +++
.../base/revocable/test_modules/Makefile | 10 +
.../revocable/test_modules/revocable_test.c | 188 ++++++++++++++
23 files changed, 1086 insertions(+), 3 deletions(-)
create mode 100644 Documentation/driver-api/driver-model/revocable.rst
create mode 100644 drivers/base/revocable.c
create mode 100644 drivers/base/revocable_test.c
create mode 100644 fs/fs_revocable.c
create mode 100644 include/linux/fs_revocable.h
create mode 100644 include/linux/revocable.h
create mode 100644 tools/testing/selftests/drivers/base/revocable/Makefile
create mode 100644 tools/testing/selftests/drivers/base/revocable/revocable_test.c
create mode 100755 tools/testing/selftests/drivers/base/revocable/test-revocable.sh
create mode 100644 tools/testing/selftests/drivers/base/revocable/test_modules/Makefile
create mode 100644 tools/testing/selftests/drivers/base/revocable/test_modules/revocable_test.c
--
2.51.0.788.g6d19910ace-goog
v22: fixing build error due to -march=zicfiss being picked in gcc-13 and above
but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v22:
- CONFIG_RISCV_USER_CFI was by default "n". With dual vdso support it is
default "y" (if toolchain supports it). Fixing build error due to
"-march=zicfiss" being picked in gcc-13 partially. gcc-13 only recognizes the
flag but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
- picked up tags and some cosmetic changes in commit message for dual vdso
patch.
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v22:
- Link to v21: https://lore.kernel.org/r/20251015-v5_user_cfi_series-v21-0-6a07856e90e7@ri…
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 22 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 95 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 27 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 3 +
tools/testing/selftests/riscv/cfi/Makefile | 16 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/riscv_cfi_test.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2475 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
Overall, we encountered a warning [1] that can be triggered by running the
selftest I provided.
MPTCP creates subflows for data transmission between two endpoints.
However, BPF can use sockops to perform additional operations when TCP
completes the three-way handshake. The issue arose because we used sockmap
in sockops, which replaces sk->sk_prot and some handlers. Since subflows
also have their own specialized handlers, this creates a conflict and leads
to traffic failure. Therefore, we need to reject operations targeting
subflows.
This patchset simply prevents the combination of subflows and sockmap
without changing any functionality.
A complete integration of MPTCP and sockmap would require more effort, for
example, we would need to retrieve the parent socket from subflows in
sockmap and implement handlers like read_skb.
If maintainers don't object, we can further improve this in subsequent
work.
v1: https://lore.kernel.org/mptcp/a0a2b87119a06c5ffaa51427a0964a05534fe6f1@linu…
[1] truncated warning:
[ 18.234652] ------------[ cut here ]------------
[ 18.234664] WARNING: CPU: 1 PID: 388 at net/mptcp/protocol.c:68 mptcp_stream_accept+0x34c/0x380
[ 18.234726] Modules linked in:
[ 18.234755] RIP: 0010:mptcp_stream_accept+0x34c/0x380
[ 18.234762] RSP: 0018:ffffc90000cf3cf8 EFLAGS: 00010202
[ 18.234800] PKRU: 55555554
[ 18.234806] Call Trace:
[ 18.234810] <TASK>
[ 18.234837] do_accept+0xeb/0x190
[ 18.234861] ? __x64_sys_pselect6+0x61/0x80
[ 18.234898] ? _raw_spin_unlock+0x12/0x30
[ 18.234915] ? alloc_fd+0x11e/0x190
[ 18.234925] __sys_accept4+0x8c/0x100
[ 18.234930] __x64_sys_accept+0x1f/0x30
[ 18.234933] x64_sys_call+0x202f/0x20f0
[ 18.234966] do_syscall_64+0x72/0x9a0
[ 18.234979] ? switch_fpu_return+0x60/0xf0
[ 18.234993] ? irqentry_exit_to_user_mode+0xdb/0x1e0
[ 18.235002] ? irqentry_exit+0x3f/0x50
[ 18.235005] ? clear_bhb_loop+0x50/0xa0
[ 18.235022] ? clear_bhb_loop+0x50/0xa0
[ 18.235025] ? clear_bhb_loop+0x50/0xa0
[ 18.235028] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 18.235066] </TASK>
[ 18.235109] ---[ end trace 0000000000000000 ]---
[ 18.235677] sockmap: MPTCP sockets are not supported
Jiayuan Chen (3):
net,mptcp: fix incorrect IPv4/IPv6 fallback detection with BPF Sockmap
bpf,sockmap: disallow MPTCP sockets from sockmap updates
selftests/bpf: Add mptcp test with sockmap
net/core/sock_map.c | 9 ++
net/mptcp/protocol.c | 7 +-
.../testing/selftests/bpf/prog_tests/mptcp.c | 136 ++++++++++++++++++
.../selftests/bpf/progs/mptcp_sockmap.c | 43 ++++++
4 files changed, 193 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/selftests/bpf/progs/mptcp_sockmap.c
--
2.43.0
DAMON maintains the targets in a list, and allows committing only an
entire list of targets having the new parameters. Targets having same
index on the lists are treated as matching source and destination
targets. If an existing target cannot find a matching one in the
sources list, the target is removed. This means that there is no way to
remove only a specific monitoring target in the middle of the current
targets list.
Such pin-point target removal is really needed in some use cases,
though. Monitoring access patterns on virtual address spaces of
processes that spawned from the same ancestor is one example. If a
process of the group is terminated, the user may want to remove the
matching DAMON target as soon as possible, to save in-kernel memory
usage for the unnecessary target data. The user may also want to do
that without turning DAMON off or removing unnecessary targets, to keep
the current monitoring results for other active processes.
Extend DAMON kernel API and sysfs ABI to support the pin-point removal
in the following way. For API, add a new damon_target field, namely
'obsolete'. If the field on parameters commit source target is set, it
means the matching destination target is obsolete. Then the parameters
commit logic removes the destination target from the existing targets
list. For sysfs ABI, add a new file under the target directory, namely
'obsolete_target'. It is connected with the 'obsolete' field of the
commit source targets, so internally using the new API.
Also add a selftest for the new feature. The related helper scripts for
manipulating the sysfs interface and dumping in-kernel DAMON status are
also extended for this. Note that the selftest part was initially
posted as an individual RFC series [1], but now merged into this one.
Bijan Tabatabai (bijan311(a)gmail.com) has originally reported this issue,
and participated in this solution design on a GitHub issue [1] for DAMON
user-space tool.
Changes from RFC
(https://lore.kernel.org/20251016214736.84286-1-sj@kernel.org)
- Wordsmith commit messages
- Add Reviewed-by: tags from Bijan
- Add a kselftest for the functionality of the new feature
(https://lore.kernel.org/20251018204448.8906-1-sj@kernel.org)
[1] https://github.com/damonitor/damo/issues/36
SeongJae Park (9):
mm/damon/core: add damon_target->obsolete for pin-point removal
mm/damon/sysfs: test commit input against realistic destination
mm/damon/sysfs: implement obsolete_target file
Docs/admin-guide/mm/damon/usage: document obsolete_target file
Docs/ABI/damon: document obsolete_target sysfs file
selftests/damon/_damon_sysfs: support obsolete_target file
drgn_dump_damon_status: dump damon_target->obsolete
sysfs.py: extend assert_ctx_committed() for monitoring targets
selftests/damon/sysfs: add obsolete_target test
.../ABI/testing/sysfs-kernel-mm-damon | 7 +++
Documentation/admin-guide/mm/damon/usage.rst | 13 +++--
include/linux/damon.h | 6 +++
mm/damon/core.c | 10 +++-
mm/damon/sysfs.c | 51 ++++++++++++++++++-
tools/testing/selftests/damon/_damon_sysfs.py | 11 +++-
.../selftests/damon/drgn_dump_damon_status.py | 1 +
tools/testing/selftests/damon/sysfs.py | 48 +++++++++++++++++
8 files changed, 140 insertions(+), 7 deletions(-)
base-commit: a3e008fdd7964bc3e6d876491c202d476406ed59
--
2.47.3
This series introduces stats counters for psp. Device key rotations,
and so called 'stale-events' are common to all drivers and are tracked
by the core.
A driver facing api is provided for reporting stats required by the
"Implementation Requirements" section of the PSP Architecture
Specification. Drivers must implement these stats.
Lastly, implementations of the driver stats api for mlx5 and netdevsim
are included.
Here is the output of running the psp selftest suite and then
printing out stats with the ynl cli on system with a psp-capable CX7:
$ ./ksft-psp-stats/drivers/net/psp.py
TAP version 13
1..28
ok 1 psp.test_case # SKIP Test requires IPv4 connectivity
ok 2 psp.data_basic_send_v0_ip6
ok 3 psp.test_case # SKIP Test requires IPv4 connectivity
ok 4 psp.data_basic_send_v1_ip6
ok 5 psp.test_case # SKIP Test requires IPv4 connectivity
ok 6 psp.data_basic_send_v2_ip6 # SKIP ('PSP version not supported', 'hdr0-aes-gmac-128')
ok 7 psp.test_case # SKIP Test requires IPv4 connectivity
ok 8 psp.data_basic_send_v3_ip6 # SKIP ('PSP version not supported', 'hdr0-aes-gmac-256')
ok 9 psp.test_case # SKIP Test requires IPv4 connectivity
ok 10 psp.data_mss_adjust_ip6
ok 11 psp.dev_list_devices
ok 12 psp.dev_get_device
ok 13 psp.dev_get_device_bad
ok 14 psp.dev_rotate
ok 15 psp.dev_rotate_spi
ok 16 psp.assoc_basic
ok 17 psp.assoc_bad_dev
ok 18 psp.assoc_sk_only_conn
ok 19 psp.assoc_sk_only_mismatch
ok 20 psp.assoc_sk_only_mismatch_tx
ok 21 psp.assoc_sk_only_unconn
ok 22 psp.assoc_version_mismatch
ok 23 psp.assoc_twice
ok 24 psp.data_send_bad_key
ok 25 psp.data_send_disconnect
ok 26 psp.data_stale_key
ok 27 psp.removal_device_rx # XFAIL Test only works on netdevsim
ok 28 psp.removal_device_bi # XFAIL Test only works on netdevsim
# Totals: pass:19 fail:0 xfail:2 xpass:0 skip:7 error:0
#
# Responder logs (0):
# STDERR:
# Set PSP enable on device 1 to 0x3
# Set PSP enable on device 1 to 0x0
$ cd ynl/
$ ./pyynl/cli.py --spec netlink/specs/psp.yaml --dump get-stats
[{'dev-id': 1,
'key-rotations': 5,
'rx-auth-fail': 21,
'rx-bad': 0,
'rx-bytes': 11844,
'rx-error': 0,
'rx-packets': 94,
'stale-events': 6,
'tx-bytes': 1128456,
'tx-error': 0,
'tx-packets': 780}]
Daniel Zahka (2):
selftests: drv-net: psp: add assertions on core-tracked psp dev stats
netdevsim: implement psp device stats
Jakub Kicinski (3):
psp: report basic stats from the core
psp: add stats from psp spec to driver facing api
net/mlx5e: Add PSP stats support for Rx/Tx flows
Documentation/netlink/specs/psp.yaml | 95 +++++++
.../mellanox/mlx5/core/en_accel/psp.c | 239 ++++++++++++++++--
.../mellanox/mlx5/core/en_accel/psp.h | 18 ++
.../mellanox/mlx5/core/en_accel/psp_rxtx.c | 1 +
.../net/ethernet/mellanox/mlx5/core/en_main.c | 5 +
drivers/net/netdevsim/netdevsim.h | 5 +
drivers/net/netdevsim/psp.c | 27 ++
include/net/psp/types.h | 35 +++
include/uapi/linux/psp.h | 18 ++
net/psp/psp-nl-gen.c | 19 ++
net/psp/psp-nl-gen.h | 2 +
net/psp/psp_main.c | 3 +-
net/psp/psp_nl.c | 99 ++++++++
net/psp/psp_sock.c | 4 +-
tools/testing/selftests/drivers/net/psp.py | 13 +
15 files changed, 566 insertions(+), 17 deletions(-)
--
2.47.3
Prior to commit 9245fd6b8531 ("KVM: x86: model canonical checks more
precisely"), KVM_SET_NESTED_STATE would fail if the state was captured
with L2 active, L1 had CR4.LA57 set, L2 did not, and the
VMCS12.HOST_GSBASE (or other host-state field checked for canonicality)
had an address greater than 48 bits wide.
Add a regression test that reproduces the KVM_SET_NESTED_STATE failure
conditions. To do so, the first three patches add support for 5-level
paging in the selftest L1 VM.
Jim Mattson (4):
KVM: selftests: Use a loop to create guest page tables
KVM: selftests: Use a loop to walk guest page tables
KVM: selftests: Add VM_MODE_PXXV57_4K VM mode
KVM: selftests: Add a VMX test for LA57 nested state
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../testing/selftests/kvm/include/kvm_util.h | 1 +
tools/testing/selftests/kvm/lib/kvm_util.c | 21 +++
.../testing/selftests/kvm/lib/x86/processor.c | 66 ++++-----
tools/testing/selftests/kvm/lib/x86/vmx.c | 7 +-
.../kvm/x86/vmx_la57_nested_state_test.c | 137 ++++++++++++++++++
6 files changed, 195 insertions(+), 38 deletions(-)
create mode 100644 tools/testing/selftests/kvm/x86/vmx_la57_nested_state_test.c
--
2.51.0.470.ga7dc726c21-goog
This series fixes a race condition in netconsole's userdata handling
where concurrent message transmission could read partially updated
userdata fields, resulting in corrupted netconsole output.
The first patch adds a selftest that reproduces the race condition by
continuously sending messages while rapidly changing userdata values,
detecting any torn reads in the output.
The second patch fixes the issue by ensuring update_userdata() holds
the target_list_lock while updating both extradata_complete and
userdata_length, preventing readers from seeing inconsistent state.
This targets net tree as it fixes a bug introduced in commit df03f830d099
("net: netconsole: cache userdata formatted string in netconsole_target").
Signed-off-by: Gustavo Luiz Duarte <gustavold(a)gmail.com>
---
Gustavo Luiz Duarte (2):
selftests: netconsole: Add race condition test for userdata corruption
netconsole: Fix race condition in between reader and writer of userdata
drivers/net/netconsole.c | 5 ++
.../selftests/drivers/net/netcons_race_userdata.sh | 87 ++++++++++++++++++++++
2 files changed, 92 insertions(+)
---
base-commit: ffff5c8fc2af2218a3332b3d5b97654599d50cde
change-id: 20251020-netconsole-fix-race-f465f37b57ea
Best regards,
--
Gustavo Luiz Duarte <gustavold(a)gmail.com>
This series creates a new PMU scheme on ARM, a partitioned PMU that
allows reserving a subset of counters for more direct guest access,
significantly reducing overhead. More details, including performance
benchmarks, can be read in the v1 cover letter linked below.
v4:
* Apply Mark Brown's non-UNDEF FGT control commit to the PMU FGT
controls and calculate those controls with the others in
kvm_calculate_traps()
* Introduce lazy context swaps for guests that only turns on for
guests that have enabled partitioning and accessed PMU registers.
* Rename pmu-part.c to pmu-direct.c because future features might
achieve direct PMU access without partitioning.
* Better explain certain commits, such as why the untrapped registers
are safe to untrap.
* Reduce the PMU include cleanup down to only what is still necessary
and explain why.
v3:
https://lore.kernel.org/kvm/20250626200459.1153955-1-coltonlewis@google.com/
v2:
https://lore.kernel.org/kvm/20250620221326.1261128-1-coltonlewis@google.com/
v1:
https://lore.kernel.org/kvm/20250602192702.2125115-1-coltonlewis@google.com/
Colton Lewis (21):
arm64: cpufeature: Add cpucap for HPMN0
KVM: arm64: Reorganize PMU functions
perf: arm_pmuv3: Introduce method to partition the PMU
perf: arm_pmuv3: Generalize counter bitmasks
perf: arm_pmuv3: Keep out of guest counter partition
KVM: arm64: Account for partitioning in kvm_pmu_get_max_counters()
KVM: arm64: Set up FGT for Partitioned PMU
KVM: arm64: Writethrough trapped PMEVTYPER register
KVM: arm64: Use physical PMSELR for PMXEVTYPER if partitioned
KVM: arm64: Writethrough trapped PMOVS register
KVM: arm64: Write fast path PMU register handlers
KVM: arm64: Setup MDCR_EL2 to handle a partitioned PMU
KVM: arm64: Account for partitioning in PMCR_EL0 access
KVM: arm64: Context swap Partitioned PMU guest registers
KVM: arm64: Enforce PMU event filter at vcpu_load()
KVM: arm64: Extract enum debug_owner to enum vcpu_register_owner
KVM: arm64: Implement lazy PMU context swaps
perf: arm_pmuv3: Handle IRQs for Partitioned PMU guest counters
KVM: arm64: Inject recorded guest interrupts
KVM: arm64: Add ioctl to partition the PMU when supported
KVM: arm64: selftests: Add test case for partitioned PMU
Marc Zyngier (1):
KVM: arm64: Reorganize PMU includes
Mark Brown (1):
KVM: arm64: Introduce non-UNDEF FGT control
Documentation/virt/kvm/api.rst | 21 +
arch/arm/include/asm/arm_pmuv3.h | 38 +
arch/arm64/include/asm/arm_pmuv3.h | 61 +-
arch/arm64/include/asm/kvm_host.h | 34 +-
arch/arm64/include/asm/kvm_pmu.h | 123 +++
arch/arm64/include/asm/kvm_types.h | 7 +-
arch/arm64/kernel/cpufeature.c | 8 +
arch/arm64/kvm/Makefile | 2 +-
arch/arm64/kvm/arm.c | 22 +
arch/arm64/kvm/debug.c | 33 +-
arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 6 +-
arch/arm64/kvm/hyp/include/hyp/switch.h | 181 ++++-
arch/arm64/kvm/pmu-direct.c | 395 ++++++++++
arch/arm64/kvm/pmu-emul.c | 674 +---------------
arch/arm64/kvm/pmu.c | 725 ++++++++++++++++++
arch/arm64/kvm/sys_regs.c | 137 +++-
arch/arm64/tools/cpucaps | 1 +
arch/arm64/tools/sysreg | 6 +-
drivers/perf/arm_pmuv3.c | 128 +++-
include/linux/perf/arm_pmu.h | 1 +
include/linux/perf/arm_pmuv3.h | 14 +-
include/uapi/linux/kvm.h | 4 +
tools/include/uapi/linux/kvm.h | 2 +
.../selftests/kvm/arm64/vpmu_counter_access.c | 62 +-
24 files changed, 1910 insertions(+), 775 deletions(-)
create mode 100644 arch/arm64/kvm/pmu-direct.c
base-commit: 79150772457f4d45e38b842d786240c36bb1f97f
--
2.50.0.727.gbf7dc18ff4-goog
Fix to avoid the usage of the `ret` variable uninitialized in the
following macro expansions.
It solves the following warning:
In file included from netlink-dumps.c:21:
netlink-dumps.c: In function ‘dump_extack’:
../kselftest_harness.h:788:35: warning: ‘ret’ may be used uninitialized [-Wmaybe-uninitialized]
788 | intmax_t __exp_print = (intmax_t)__exp; \
| ^~~~~~~~~~~
../kselftest_harness.h:631:9: note: in expansion of macro ‘__EXPECT’
631 | __EXPECT(expected, #expected, seen, #seen, ==, 0)
| ^~~~~~~~
netlink-dumps.c:169:9: note: in expansion of macro ‘EXPECT_EQ’
169 | EXPECT_EQ(ret, FOUND_EXTACK);
| ^~~~~~~~~
The issue can be reproduced, building the tests, with the command:
make -C tools/testing/selftests TARGETS=net
Signed-off-by: Alessandro Zanni <alessandro.zanni87(a)gmail.com>
---
tools/testing/selftests/net/netlink-dumps.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/net/netlink-dumps.c b/tools/testing/selftests/net/netlink-dumps.c
index 7618ebe528a4..8ebb8b1b9c5c 100644
--- a/tools/testing/selftests/net/netlink-dumps.c
+++ b/tools/testing/selftests/net/netlink-dumps.c
@@ -112,7 +112,7 @@ static const struct {
TEST(dump_extack)
{
int netlink_sock;
- int i, cnt, ret;
+ int i, cnt, ret = 0;
char buf[8192];
int one = 1;
ssize_t n;
--
2.43.0
Introduce SW acceleration for IPIP tunnels in the netfilter flowtable
infrastructure. This series introduces basic infrastructure to
accelerate other tunnel types (e.g. IP6IP6).
---
Changes in v7:
- Introduce sw acceleration for tx path of IPIP tunnels
- Rely on exact match during flowtable entry lookup
- Fix typos
- Link to v6: https://lore.kernel.org/r/20250818-nf-flowtable-ipip-v6-0-eda90442739c@kern…
Changes in v6:
- Rebase on top of nf-next main branch
- Link to v5: https://lore.kernel.org/r/20250721-nf-flowtable-ipip-v5-0-0865af9e58c6@kern…
Changes in v5:
- Rely on __ipv4_addr_hash() to compute the hash used as encap ID
- Remove unnecessary pskb_may_pull() in nf_flow_tuple_encap()
- Add nf_flow_ip4_ecanp_pop utility routine
- Link to v4: https://lore.kernel.org/r/20250718-nf-flowtable-ipip-v4-0-f8bb1c18b986@kern…
Changes in v4:
- Use the hash value of the saddr, daddr and protocol of outer IP header as
encapsulation id.
- Link to v3: https://lore.kernel.org/r/20250703-nf-flowtable-ipip-v3-0-880afd319b9f@kern…
Changes in v3:
- Add outer IP header sanity checks
- target nf-next tree instead of net-next
- Link to v2: https://lore.kernel.org/r/20250627-nf-flowtable-ipip-v2-0-c713003ce75b@kern…
Changes in v2:
- Introduce IPIP flowtable selftest
- Link to v1: https://lore.kernel.org/r/20250623-nf-flowtable-ipip-v1-1-2853596e3941@kern…
---
Lorenzo Bianconi (3):
net: netfilter: Add IPIP flowtable rx sw acceleration
net: netfilter: Add IPIP flowtable tx sw acceleration
selftests: netfilter: nft_flowtable.sh: Add IPIP flowtable selftest
include/linux/netdevice.h | 16 +++
include/net/netfilter/nf_flow_table.h | 26 +++++
net/ipv4/ipip.c | 29 +++++
net/netfilter/nf_flow_table_core.c | 10 ++
net/netfilter/nf_flow_table_ip.c | 118 ++++++++++++++++++++-
net/netfilter/nft_flow_offload.c | 79 ++++++++++++--
.../selftests/net/netfilter/nft_flowtable.sh | 40 +++++++
7 files changed, 307 insertions(+), 11 deletions(-)
---
base-commit: d1d7998df9d7d3ee20bcfc876065fa897b11506d
change-id: 20250623-nf-flowtable-ipip-1b3d7b08d067
Best regards,
--
Lorenzo Bianconi <lorenzo(a)kernel.org>
Dzień dobry,
kontaktuję się w imieniu kancelarii specjalizującej się w zarządzaniu wierzytelnościami.
Od lat wspieramy firmy w odzyskiwaniu należności. Prowadzimy kompleksową obsługę na etapach: przedsądowym, sądowym i egzekucyjnym, dostosowując działania do branży Klienta.
Kiedy możemy porozmawiać?
Pozdrawiam
Eryk Wawrzyn
Here are a few independent fixes related to MPTCP and its selftests:
- Patch 1: correctly handle ADD_ADDR being received after the switch to
'fully-established'. A fix for another recent fix backported up to
v5.14.
- Patches 2-5: properly mark some MPTCP Join subtests as 'skipped' if
the tested kernel doesn't support the feature being validated. Some
fixes for up to v5.13, v5.18, v6.11 and v6.18-rc1 respectively.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Matthieu Baerts (NGI0) (5):
mptcp: pm: in-kernel: C-flag: handle late ADD_ADDR
selftests: mptcp: join: mark 'flush re-add' as skipped if not supported
selftests: mptcp: join: mark implicit tests as skipped if not supported
selftests: mptcp: join: mark 'delete re-add signal' as skipped if not supported
selftests: mptcp: join: mark laminar tests as skipped if not supported
net/mptcp/pm_kernel.c | 6 ++++++
tools/testing/selftests/net/mptcp/mptcp_join.sh | 18 +++++++++---------
2 files changed, 15 insertions(+), 9 deletions(-)
---
base-commit: ffff5c8fc2af2218a3332b3d5b97654599d50cde
change-id: 20251020-net-mptcp-c-flag-late-add-addr-1d954e7b63d2
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
This series adds namespace support to vhost-vsock and loopback. It does
not add namespaces to any of the other guest transports (virtio-vsock,
hyperv, or vmci).
The current revision supports two modes: local and global. Local
mode is complete isolation of namespaces, while global mode is complete
sharing between namespaces of CIDs (the original behavior).
The mode is set using /proc/sys/net/vsock/ns_mode.
Modes are per-netns and write-once. This allows a system to configure
namespaces independently (some may share CIDs, others are completely
isolated). This also supports future possible mixed use cases, where
there may be namespaces in global mode spinning up VMs while there are
mixed mode namespaces that provide services to the VMs, but are not
allowed to allocate from the global CID pool (this mode not implemented
in this series).
If a socket or VM is created when a namespace is global but the
namespace changes to local, the socket or VM will continue working
normally. That is, the socket or VM assumes the mode behavior of the
namespace at the time the socket/VM was created. The original mode is
captured in vsock_create() and so occurs at the time of socket(2) and
accept(2) for sockets and open(2) on /dev/vhost-vsock for VMs. This
prevents a socket/VM connection from suddenly breaking due to a
namespace mode change. Any new sockets/VMs created after the mode change
will adopt the new mode's behavior.
Additionally, added tests for the new namespace features:
tools/testing/selftests/vsock/vmtest.sh
1..30
ok 1 vm_server_host_client
ok 2 vm_client_host_server
ok 3 vm_loopback
ok 4 ns_host_vsock_ns_mode_ok
ok 5 ns_host_vsock_ns_mode_write_once_ok
ok 6 ns_global_same_cid_fails
ok 7 ns_local_same_cid_ok
ok 8 ns_global_local_same_cid_ok
ok 9 ns_local_global_same_cid_ok
ok 10 ns_diff_global_host_connect_to_global_vm_ok
ok 11 ns_diff_global_host_connect_to_local_vm_fails
ok 12 ns_diff_global_vm_connect_to_global_host_ok
ok 13 ns_diff_global_vm_connect_to_local_host_fails
ok 14 ns_diff_local_host_connect_to_local_vm_fails
ok 15 ns_diff_local_vm_connect_to_local_host_fails
ok 16 ns_diff_global_to_local_loopback_local_fails
ok 17 ns_diff_local_to_global_loopback_fails
ok 18 ns_diff_local_to_local_loopback_fails
ok 19 ns_diff_global_to_global_loopback_ok
ok 20 ns_same_local_loopback_ok
ok 21 ns_same_local_host_connect_to_local_vm_ok
ok 22 ns_same_local_vm_connect_to_local_host_ok
ok 23 ns_mode_change_connection_continue_vm_ok
ok 24 ns_mode_change_connection_continue_host_ok
ok 25 ns_mode_change_connection_continue_both_ok
ok 26 ns_delete_vm_ok
ok 27 ns_delete_host_ok
ok 28 ns_delete_both_ok
ok 29 ns_loopback_global_global_late_module_load_ok
ok 30 ns_loopback_local_local_late_module_load_fails
SUMMARY: PASS=30 SKIP=0 FAIL=0
Thanks again for everyone's help and reviews!
Signed-off-by: Bobby Eshleman <bobbyeshleman(a)gmail.com>
To: Stefano Garzarella <sgarzare(a)redhat.com>
To: Shuah Khan <shuah(a)kernel.org>
To: David S. Miller <davem(a)davemloft.net>
To: Eric Dumazet <edumazet(a)google.com>
To: Jakub Kicinski <kuba(a)kernel.org>
To: Paolo Abeni <pabeni(a)redhat.com>
To: Simon Horman <horms(a)kernel.org>
To: Stefan Hajnoczi <stefanha(a)redhat.com>
To: Michael S. Tsirkin <mst(a)redhat.com>
To: Jason Wang <jasowang(a)redhat.com>
To: Xuan Zhuo <xuanzhuo(a)linux.alibaba.com>
To: Eugenio Pérez <eperezma(a)redhat.com>
To: K. Y. Srinivasan <kys(a)microsoft.com>
To: Haiyang Zhang <haiyangz(a)microsoft.com>
To: Wei Liu <wei.liu(a)kernel.org>
To: Dexuan Cui <decui(a)microsoft.com>
To: Bryan Tan <bryan-bt.tan(a)broadcom.com>
To: Vishnu Dasa <vishnu.dasa(a)broadcom.com>
To: Broadcom internal kernel review list <bcm-kernel-feedback-list(a)broadcom.com>
Cc: virtualization(a)lists.linux.dev
Cc: netdev(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: kvm(a)vger.kernel.org
Cc: linux-hyperv(a)vger.kernel.org
Cc: berrange(a)redhat.com
Changes in v7:
- fix hv_sock build
- break out vmtest patches into distinct, more well-scoped patches
- change `orig_net_mode` to `net_mode`
- many fixes and style changes in per-patch change sets (see individual
patches for specific changes)
- optimize `virtio_vsock_skb_cb` layout
- update commit messages with more useful descriptions
- vsock_loopback: use orig_net_mode instead of current net mode
- add tests for edge cases (ns deletion, mode changing, loopback module
load ordering)
- Link to v6: https://lore.kernel.org/r/20250916-vsock-vmtest-v6-0-064d2eb0c89d@meta.com
Changes in v6:
- define behavior when mode changes to local while socket/VM is alive
- af_vsock: clarify description of CID behavior
- af_vsock: use stronger langauge around CID rules (dont use "may")
- af_vsock: improve naming of buf/buffer
- af_vsock: improve string length checking on proc writes
- vsock_loopback: add space in struct to clarify lock protection
- vsock_loopback: do proper cleanup/unregister on vsock_loopback_exit()
- vsock_loopback: use virtio_vsock_skb_net() instead of sock_net()
- vsock_loopback: set loopback to NULL after kfree()
- vsock_loopback: use pernet_operations and remove callback mechanism
- vsock_loopback: add macros for "global" and "local"
- vsock_loopback: fix length checking
- vmtest.sh: check for namespace support in vmtest.sh
- Link to v5: https://lore.kernel.org/r/20250827-vsock-vmtest-v5-0-0ba580bede5b@meta.com
Changes in v5:
- /proc/net/vsock_ns_mode -> /proc/sys/net/vsock/ns_mode
- vsock_global_net -> vsock_global_dummy_net
- fix netns lookup in vhost_vsock to respect pid namespaces
- add callbacks for vsock_loopback to avoid circular dependency
- vmtest.sh loads vsock_loopback module
- remove vsock_net_mode_can_set()
- change vsock_net_write_mode() to return true/false based on success
- make vsock_net_mode enum instead of u8
- Link to v4: https://lore.kernel.org/r/20250805-vsock-vmtest-v4-0-059ec51ab111@meta.com
Changes in v4:
- removed RFC tag
- implemented loopback support
- renamed new tests to better reflect behavior
- completed suite of tests with permutations of ns modes and vsock_test
as guest/host
- simplified socat bridging with unix socket instead of tcp + veth
- only use vsock_test for success case, socat for failure case (context
in commit message)
- lots of cleanup
Changes in v3:
- add notion of "modes"
- add procfs /proc/net/vsock_ns_mode
- local and global modes only
- no /dev/vhost-vsock-netns
- vmtest.sh already merged, so new patch just adds new tests for NS
- Link to v2:
https://lore.kernel.org/kvm/20250312-vsock-netns-v2-0-84bffa1aa97a@gmail.com
Changes in v2:
- only support vhost-vsock namespaces
- all g2h namespaces retain old behavior, only common API changes
impacted by vhost-vsock changes
- add /dev/vhost-vsock-netns for "opt-in"
- leave /dev/vhost-vsock to old behavior
- removed netns module param
- Link to v1:
https://lore.kernel.org/r/20200116172428.311437-1-sgarzare@redhat.com
Changes in v1:
- added 'netns' module param to vsock.ko to enable the
network namespace support (disabled by default)
- added 'vsock_net_eq()' to check the "net" assigned to a socket
only when 'netns' support is enabled
- Link to RFC: https://patchwork.ozlabs.org/cover/1202235/
---
Bobby Eshleman (26):
vsock: a per-net vsock NS mode state
vsock/virtio: pack struct virtio_vsock_skb_cb
vsock: add netns to vsock skb cb
vsock: add netns to vsock core
vsock/loopback: add netns support
vsock/virtio: add netns to virtio transport common
vhost/vsock: add netns support
selftests/vsock: improve logging in vmtest.sh
selftests/vsock: make wait_for_listener() work even if pipefail is on
selftests/vsock: reuse logic for vsock_test through wrapper functions
selftests/vsock: avoid multi-VM pidfile collisions with QEMU
selftests/vsock: do not unconditionally die if qemu fails
selftests/vsock: speed up tests by reducing the QEMU pidfile timeout
selftests/vsock: add check_result() for pass/fail counting
selftests/vsock: identify and execute tests that can re-use VM
selftests/vsock: add namespace initialization function
selftests/vsock: remove namespaces in cleanup()
selftests/vsock: prepare vm management helpers for namespaces
selftests/vsock: add BUILD=0 definition
selftests/vsock: avoid false-positives when checking dmesg
selftests/vsock: add tests for proc sys vsock ns_mode
selftests/vsock: add namespace tests for CID collisions
selftests/vsock: add tests for host <-> vm connectivity with namespaces
selftests/vsock: add tests for namespace deletion and mode changes
selftests/vsock: add tests for module loading order
selftests/vsock: add 1.37 to tested virtme-ng versions
MAINTAINERS | 1 +
drivers/vhost/vsock.c | 48 +-
include/linux/virtio_vsock.h | 47 +-
include/net/af_vsock.h | 78 +-
include/net/net_namespace.h | 4 +
include/net/netns/vsock.h | 22 +
net/vmw_vsock/af_vsock.c | 264 ++++++-
net/vmw_vsock/virtio_transport.c | 7 +-
net/vmw_vsock/virtio_transport_common.c | 21 +-
net/vmw_vsock/vsock_loopback.c | 89 ++-
tools/testing/selftests/vsock/vmtest.sh | 1320 ++++++++++++++++++++++++++++---
11 files changed, 1729 insertions(+), 172 deletions(-)
---
base-commit: 3ff9bcecce83f12169ab3e42671bd76554ca521a
change-id: 20250325-vsock-vmtest-b3a21d2102c2
Best regards,
--
Bobby Eshleman <bobbyeshleman(a)meta.com>
From: Wilfred Mallawa <wilfred.mallawa(a)wdc.com>
During a handshake, an endpoint may specify a maximum record size limit.
Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
maximum record size. Meaning that, the outgoing records from the kernel
can exceed a lower size negotiated during the handshake. In such a case,
the TLS endpoint must send a fatal "record_overflow" alert [1], and
thus the record is discarded.
Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
support. For these devices, supporting TLS record size negotiation is
necessary because the maximum TLS record size supported by the controller
is less than the default 16KB currently used by the kernel.
Currently, there is no way to inform the kernel of such a limit. This patch
adds support to a new setsockopt() option `TLS_TX_MAX_PAYLOAD_LEN` that
allows for setting the maximum plaintext fragment size. Once set, outgoing
records are no larger than the size specified. This option can be used to
specify the record size limit.
[1] https://www.rfc-editor.org/rfc/rfc8449
Signed-off-by: Wilfred Mallawa <wilfred.mallawa(a)wdc.com>
---
V6 -> V7:
- Added more information to the description regarding record_size_limit
- For TLS 1.3, setsockopt() now allows a 63 byte minimum to account for the
ContentType
- getsockopt() returns the total plaintext length, for TLS 1.3, this will
1 byte higher than what is set using setsockopt().
---
Documentation/networking/tls.rst | 22 +++++++++++
include/net/tls.h | 3 ++
include/uapi/linux/tls.h | 2 +
net/tls/tls_device.c | 2 +-
net/tls/tls_main.c | 68 ++++++++++++++++++++++++++++++++
net/tls/tls_sw.c | 2 +-
6 files changed, 97 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index 36cc7afc2527..ecaa7631ec46 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -280,6 +280,28 @@ If the record decrypted turns out to had been padded or is not a data
record it will be decrypted again into a kernel buffer without zero copy.
Such events are counted in the ``TlsDecryptRetry`` statistic.
+TLS_TX_MAX_PAYLOAD_LEN
+~~~~~~~~~~~~~~~~~~~~~~
+
+Specifies the maximum size of the plaintext payload for transmitted TLS records.
+
+When this option is set, the kernel enforces the specified limit on all outgoing
+TLS records. No plaintext fragment will exceed this size. This option can be used
+to implement the TLS Record Size Limit extension [1].
+ - For TLS 1.2, the value corresponds directly to the record size limit.
+ - For TLS 1.3, the value should be set to record_size_limit - 1, since
+ the record size limit includes one additional byte for the ContentType
+ field.
+
+The valid range for this option is 64 to 16384 bytes for TLS 1.2, and 63 to
+16384 bytes for TLS 1.3. The lower minimum for TLS 1.3 accounts for the
+extra byte used by the ContentType field.
+
+For TLS 1.3, getsockopt() will return the total plaintext fragment length,
+inclusive of the ContentType field.
+
+[1] https://datatracker.ietf.org/doc/html/rfc8449
+
Statistics
==========
diff --git a/include/net/tls.h b/include/net/tls.h
index 857340338b69..f2af113728aa 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -53,6 +53,8 @@ struct tls_rec;
/* Maximum data size carried in a TLS record */
#define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14)
+/* Minimum record size limit as per RFC8449 */
+#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6)
#define TLS_HEADER_SIZE 5
#define TLS_NONCE_OFFSET TLS_HEADER_SIZE
@@ -226,6 +228,7 @@ struct tls_context {
u8 rx_conf:3;
u8 zerocopy_sendfile:1;
u8 rx_no_pad:1;
+ u16 tx_max_payload_len;
int (*push_pending_record)(struct sock *sk, int flags);
void (*sk_write_space)(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index b66a800389cc..b8b9c42f848c 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -41,6 +41,7 @@
#define TLS_RX 2 /* Set receive parameters */
#define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */
#define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */
+#define TLS_TX_MAX_PAYLOAD_LEN 5 /* Maximum plaintext size */
/* Supported versions */
#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
@@ -194,6 +195,7 @@ enum {
TLS_INFO_RXCONF,
TLS_INFO_ZC_RO_TX,
TLS_INFO_RX_NO_PAD,
+ TLS_INFO_TX_MAX_PAYLOAD_LEN,
__TLS_INFO_MAX,
};
#define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index caa2b5d24622..4d29b390aed9 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -462,7 +462,7 @@ static int tls_push_data(struct sock *sk,
/* TLS_HEADER_SIZE is not counted as part of the TLS record, and
* we need to leave room for an authentication tag.
*/
- max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
+ max_open_record_len = tls_ctx->tx_max_payload_len +
prot->prepend_size;
do {
rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 39a2ab47fe72..b234d44bd789 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -541,6 +541,32 @@ static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval,
return 0;
}
+static int do_tls_getsockopt_tx_payload_len(struct sock *sk, char __user *optval,
+ int __user *optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ u16 payload_len = ctx->tx_max_payload_len;
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ /* For TLS 1.3 payload length includes ContentType */
+ if (ctx->prot_info.version == TLS_1_3_VERSION)
+ payload_len++;
+
+ if (len < sizeof(payload_len))
+ return -EINVAL;
+
+ if (put_user(sizeof(payload_len), optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, &payload_len, sizeof(payload_len)))
+ return -EFAULT;
+
+ return 0;
+}
+
static int do_tls_getsockopt(struct sock *sk, int optname,
char __user *optval, int __user *optlen)
{
@@ -560,6 +586,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_getsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_MAX_PAYLOAD_LEN:
+ rc = do_tls_getsockopt_tx_payload_len(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -809,6 +838,32 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
return rc;
}
+static int do_tls_setsockopt_tx_payload_len(struct sock *sk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx);
+ u16 value;
+ bool tls_13 = ctx->prot_info.version == TLS_1_3_VERSION;
+
+ if (sw_ctx && sw_ctx->open_rec)
+ return -EBUSY;
+
+ if (sockptr_is_null(optval) || optlen != sizeof(value))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&value, optval, sizeof(value)))
+ return -EFAULT;
+
+ if (value < TLS_MIN_RECORD_SIZE_LIM - (tls_13 ? 1 : 0) ||
+ value > TLS_MAX_PAYLOAD_SIZE)
+ return -EINVAL;
+
+ ctx->tx_max_payload_len = value;
+
+ return 0;
+}
+
static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
unsigned int optlen)
{
@@ -830,6 +885,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_MAX_PAYLOAD_LEN:
+ lock_sock(sk);
+ rc = do_tls_setsockopt_tx_payload_len(sk, optval, optlen);
+ release_sock(sk);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -1019,6 +1079,7 @@ static int tls_init(struct sock *sk)
ctx->tx_conf = TLS_BASE;
ctx->rx_conf = TLS_BASE;
+ ctx->tx_max_payload_len = TLS_MAX_PAYLOAD_SIZE;
update_sk_prot(sk, ctx);
out:
write_unlock_bh(&sk->sk_callback_lock);
@@ -1108,6 +1169,12 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
goto nla_failure;
}
+ err = nla_put_u16(skb, TLS_INFO_TX_MAX_PAYLOAD_LEN,
+ ctx->tx_max_payload_len);
+
+ if (err)
+ goto nla_failure;
+
rcu_read_unlock();
nla_nest_end(skb, start);
return 0;
@@ -1129,6 +1196,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin)
nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */
nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */
+ nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_MAX_PAYLOAD_LEN */
0;
return size;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index d17135369980..9937d4c810f2 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1079,7 +1079,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
orig_size = msg_pl->sg.size;
full_record = false;
try_to_copy = msg_data_left(msg);
- record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
+ record_room = tls_ctx->tx_max_payload_len - msg_pl->sg.size;
if (try_to_copy >= record_room) {
try_to_copy = record_room;
full_record = true;
--
2.51.0
Recently, I noticed a selftest failure in my local environment. The
test_parse_test_list_file writes some data to
/tmp/bpf_arg_parsing_test.XXXXXX and parse_test_list_file() will read
the data back. However, after writing data to that file, we forget to
call fsync() and it's causing testing failure in my laptop. This patch
helps fix it by adding the missing fsync() call.
Signed-off-by: Xing Guo <higuoxing(a)gmail.com>
---
tools/testing/selftests/bpf/prog_tests/arg_parsing.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/arg_parsing.c b/tools/testing/selftests/bpf/prog_tests/arg_parsing.c
index bb143de68875..4f071943ffb0 100644
--- a/tools/testing/selftests/bpf/prog_tests/arg_parsing.c
+++ b/tools/testing/selftests/bpf/prog_tests/arg_parsing.c
@@ -140,6 +140,7 @@ static void test_parse_test_list_file(void)
fprintf(fp, "testA/subtest2\n");
fprintf(fp, "testC_no_eof_newline");
fflush(fp);
+ fsync(fd);
if (!ASSERT_OK(ferror(fp), "prepare tmp"))
goto out_fclose;
--
2.51.0
This patch series suggests fixes for several corner cases in the RISC-V
vector ptrace implementation:
- follow gdbserver expectations and return ENODATA instead of EINVAL if vector
extension is supported but not yet activated for a traced process
- force vector context save on the next context switch after ptrace call that
modified vector CSRs, to avoid reading stale values by the next ptrace calls
- force vector context save on the first context switch after vector context
initialization, to avoid reading zero vlenb by an early attached debugger
For detailed description see the appropriate commit messages. A new test is
added into the tools/testing/selftests/riscv/vector to verify the fixes.
Each fix is accompanied by its own test case.
Initial version [1] of this series included only the last fix for zero vlenb.
[1] https://lore.kernel.org/linux-riscv/20250821173957.563472-1-geomatsi@gmail.…
Ilya Mamay (1):
riscv: ptrace: return ENODATA for inactive vector extension
Sergey Matyukevich (5):
selftests: riscv: test ptrace vector interface
selftests: riscv: set invalid vtype using ptrace
riscv: vector: allow to force vector context save
selftests: riscv: verify initial vector state with ptrace
riscv: vector: initialize vlenb on the first context switch
arch/riscv/include/asm/thread_info.h | 2 +
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/kernel/process.c | 2 +
arch/riscv/kernel/ptrace.c | 15 +-
arch/riscv/kernel/vector.c | 4 +
.../testing/selftests/riscv/vector/.gitignore | 1 +
tools/testing/selftests/riscv/vector/Makefile | 5 +-
.../testing/selftests/riscv/vector/v_ptrace.c | 302 ++++++++++++++++++
8 files changed, 331 insertions(+), 3 deletions(-)
create mode 100644 tools/testing/selftests/riscv/vector/v_ptrace.c
base-commit: c746c3b5169831d7fb032a1051d8b45592ae8d78
--
2.51.0
Introduce SW acceleration for IPIP tunnels in the netfilter flowtable
infrastructure.
---
Changes in v6:
- Rebase on top of nf-next main branch
- Link to v5: https://lore.kernel.org/r/20250721-nf-flowtable-ipip-v5-0-0865af9e58c6@kern…
Changes in v5:
- Rely on __ipv4_addr_hash() to compute the hash used as encap ID
- Remove unnecessary pskb_may_pull() in nf_flow_tuple_encap()
- Add nf_flow_ip4_ecanp_pop utility routine
- Link to v4: https://lore.kernel.org/r/20250718-nf-flowtable-ipip-v4-0-f8bb1c18b986@kern…
Changes in v4:
- Use the hash value of the saddr, daddr and protocol of outer IP header as
encapsulation id.
- Link to v3: https://lore.kernel.org/r/20250703-nf-flowtable-ipip-v3-0-880afd319b9f@kern…
Changes in v3:
- Add outer IP header sanity checks
- target nf-next tree instead of net-next
- Link to v2: https://lore.kernel.org/r/20250627-nf-flowtable-ipip-v2-0-c713003ce75b@kern…
Changes in v2:
- Introduce IPIP flowtable selftest
- Link to v1: https://lore.kernel.org/r/20250623-nf-flowtable-ipip-v1-1-2853596e3941@kern…
---
Lorenzo Bianconi (2):
net: netfilter: Add IPIP flowtable SW acceleration
selftests: netfilter: nft_flowtable.sh: Add IPIP flowtable selftest
include/linux/netdevice.h | 1 +
net/ipv4/ipip.c | 28 +++++++++++
net/netfilter/nf_flow_table_ip.c | 56 +++++++++++++++++++++-
net/netfilter/nft_flow_offload.c | 1 +
.../selftests/net/netfilter/nft_flowtable.sh | 40 ++++++++++++++++
5 files changed, 124 insertions(+), 2 deletions(-)
---
base-commit: bab3ce404553de56242d7b09ad7ea5b70441ea41
change-id: 20250623-nf-flowtable-ipip-1b3d7b08d067
Best regards,
--
Lorenzo Bianconi <lorenzo(a)kernel.org>
[All the precursor patches are merged now and AMD/RISCV/VTD conversions
are written]
Currently each of the iommu page table formats duplicates all of the logic
to maintain the page table and perform map/unmap/etc operations. There are
several different versions of the algorithms between all the different
formats. The io-pgtable system provides an interface to help isolate the
page table code from the iommu driver, but doesn't provide tools to
implement the common algorithms.
This makes it very hard to improve the state of the pagetable code under
the iommu domains as any proposed improvement needs to alter a large
number of different driver code paths. Combined with a lack of software
based testing this makes improvement in this area very hard.
iommufd wants several new page table operations:
- More efficient map/unmap operations, using iommufd's batching logic
- unmap that returns the physical addresses into a batch as it progresses
- cut that allows splitting areas so large pages can have holes
poked in them dynamically (ie guestmemfd hitless shared/private
transitions)
- More agressive freeing of table memory to avoid waste
- Fragmenting large pages so that dirty tracking can be more granular
- Reassembling large pages so that VMs can run at full IO performance
in migration/dirty tracking error flows
- KHO integration for kernel live upgrade
Together these are algorithmically complex enough to be a very significant
task to go and implement in all the page table formats we support. Just
the "server" focused drivers use almost all the formats (ARMv8 S1&S2 / x86
PAE / AMDv1 / VT-D SS / RISCV)
Instead of doing the duplicated work, this series takes the first step to
consolidate the algorithms into one places. In spirit it is similar to the
work Christoph did a few years back to pull the redundant get_user_pages()
implementations out of the arch code into core MM. This unlocked a great
deal of improvement in that space in the following years. I would like to
see the same benefit in iommu as well.
My first RFC showed a bigger picture with all most all formats and more
algorithms. This series reorganizes that to be narrowly focused on just
enough to convert the AMD driver to use the new mechanism.
kunit tests are provided that allow good testing of the algorithms and all
formats on x86, nothing is arch specific.
AMD is one of the simpler options as the HW is quite uniform with few
different options/bugs while still requiring the complicated contiguous
pages support. The HW also has a very simple range based invalidation
approach that is easy to implement.
The AMD v1 and AMD v2 page table formats are implemented bit for bit
identical to the current code, tested using a compare kunit test that
checks against the io-pgtable version (on github, see below).
Updating the AMD driver to replace the io-pgtable layer with the new stuff
is fairly straightforward now. The layering is fixed up in the new version
so that all the invalidation goes through function pointers.
Several small fixing patches have come out of this as I've been fixing the
problems that the test suite uncovers in the current code, and
implementing the fixed version in iommupt.
On performance, there is a quite wide variety of implementation designs
across all the drivers. Looking at some key performance across
the main formats:
iommu_map():
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 53,66 , 51,63 , 19.19 (AMDV1)
256*2^12, 386,1909 , 367,1795 , 79.79
256*2^21, 362,1633 , 355,1556 , 77.77
2^12, 56,62 , 52,59 , 11.11 (AMDv2)
256*2^12, 405,1355 , 357,1292 , 72.72
256*2^21, 393,1160 , 358,1114 , 67.67
2^12, 55,65 , 53,62 , 14.14 (VTD second stage)
256*2^12, 391,518 , 332,512 , 35.35
256*2^21, 383,635 , 336,624 , 46.46
2^12, 57,65 , 55,63 , 12.12 (ARM 64 bit)
256*2^12, 380,389 , 361,369 , 2.02
256*2^21, 358,419 , 345,400 , 13.13
iommu_unmap():
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 69,88 , 65,85 , 23.23 (AMDv1)
256*2^12, 353,6498 , 331,6029 , 94.94
256*2^21, 373,6014 , 360,5706 , 93.93
2^12, 71,72 , 66,69 , 4.04 (AMDv2)
256*2^12, 228,891 , 206,871 , 76.76
256*2^21, 254,721 , 245,711 , 65.65
2^12, 69,87 , 65,82 , 20.20 (VTD second stage)
256*2^12, 210,321 , 200,315 , 36.36
256*2^21, 255,349 , 238,342 , 30.30
2^12, 72,77 , 68,74 , 8.08 (ARM 64 bit)
256*2^12, 521,357 , 447,346 , -29.29
256*2^21, 489,358 , 433,345 , -25.25
* Above numbers include additional patches to remove the iommu_pgsize()
overheads. gcc 13.3.0, i7-12700
This version provides fairly consistent performance across formats. ARM
unmap performance is quite different because this version supports
contiguous pages and uses a very different algorithm for unmapping. Though
why it is so worse compared to AMDv1 I haven't figured out yet.
The per-format commits include a more detailed chart.
There is a second branch:
https://github.com/jgunthorpe/linux/commits/iommu_pt_all
Containing supporting work and future steps:
- ARM short descriptor (32 bit), ARM long descriptor (64 bit) formats
- RISCV format and RISCV conversion
https://github.com/jgunthorpe/linux/commits/iommu_pt_riscv
- Support for a DMA incoherent HW page table walker
- VT-D second stage format and VT-D conversion
https://github.com/jgunthorpe/linux/commits/iommu_pt_vtd
- DART v1 & v2 format
- Draft of a iommufd 'cut' operation to break down huge pages
- A compare test that checks the iommupt formats against the iopgtable
interface, including updating AMD to have a working iopgtable and patches
to make VT-D have an iopgtable for testing.
- A performance test to micro-benchmark map and unmap against iogptable
My strategy is to go one by one for the drivers:
- AMD driver conversion
- RISCV page table and driver
- Intel VT-D driver and VTDSS page table
- Flushing improvements for RISCV
- ARM SMMUv3
And concurrently work on the algorithm side:
- debugfs content dump, like VT-D has
- Cut support
- Increase/Decrease page size support
- map/unmap batching
- KHO
As we make more algorithm improvements the value to convert the drivers
increases.
This is on github: https://github.com/jgunthorpe/linux/commits/iommu_pt
v6:
- Improve comments and documentation
- Rename pt_entry_oa_full -> pt_entry_oa_exact
pt_has_system_page -> pt_has_system_page_size
pt_max_output_address_lg2 -> pt_max_oa_lg2
log2_f*() -> vaf* / oaf* / f*_t
pt_item_fully_covered -> pt_entry_fully_covered
- Fix missed constant propogation causing division
- Consolidate debugging checks to pt_check_install_leaf_args()
- Change collect->ignore_mapped to check_mapped
- Shuffle some hunks around to more appropriate patches
- Two new mini kunit tests
v5: https://patch.msgid.link/r/0-v5-116c4948af3d+68091-iommu_pt_jgg@nvidia.com
- Text grammar updates and kdoc fixes
v4: https://patch.msgid.link/r/0-v4-0d6a6726a372+18959-iommu_pt_jgg@nvidia.com
- Rebase on v6.16-rc3
- Integrate the HATS/HATDis changes
- Remove 'default n' from kconfig
- Remove unused 'PT_FIXED_TOP_LEVEL'
- Improve comments and documentation
- Fix some compile warnings from kbuild robots
v3: https://patch.msgid.link/r/0-v3-a93aab628dbc+521-iommu_pt_jgg@nvidia.com
- Rebase on v6.16-rc2
- s/PT_ENTRY_WORD_SIZE/PT_ITEM_WORD_SIZE/s to follow the language better
- Comment and documentation updates
- Add PT_TOP_PHYS_MASK to help manage alignment restrictions on the top
pointer
- Add missed force_aperture = true
- Make pt_iommu_deinit() take care of the not-yet-inited error case
internally as AMD/RISCV/VTD all shared this logic
- Change gather_range() into gather_range_pages() so it also deals with
the page list. This makes the following cache flushing series simpler
- Fix missed update of unmap->unmapped in some error cases
- Change clear_contig() to order the gather more logically
- Remove goto from the error handling in __map_range_leaf()
- s/log2_/oalog2_/ in places where the argument is an oaddr_t
- Pass the pts to pt_table_install64/32()
- Do not use SIGN_EXTEND for the AMDv2 page table because of Vasant's
information on how PASID 0 works.
v2: https://patch.msgid.link/r/0-v2-5c26bde5c22d+58b-iommu_pt_jgg@nvidia.com
- AMD driver only, many code changes
RFC: https://lore.kernel.org/all/0-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com/
Cc: Michael Roth <michael.roth(a)amd.com>
Cc: Alexey Kardashevskiy <aik(a)amd.com>
Cc: Pasha Tatashin <pasha.tatashin(a)soleen.com>
Cc: James Gowans <jgowans(a)amazon.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
Alejandro Jimenez (1):
iommu/amd: Use the generic iommu page table
Jason Gunthorpe (14):
genpt: Generic Page Table base API
genpt: Add Documentation/ files
iommupt: Add the basic structure of the iommu implementation
iommupt: Add the AMD IOMMU v1 page table format
iommupt: Add iova_to_phys op
iommupt: Add unmap_pages op
iommupt: Add map_pages op
iommupt: Add read_and_clear_dirty op
iommupt: Add a kunit test for Generic Page Table
iommupt: Add a mock pagetable format for iommufd selftest to use
iommufd: Change the selftest to use iommupt instead of xarray
iommupt: Add the x86 64 bit page table format
iommu/amd: Remove AMD io_pgtable support
iommupt: Add a kunit test for the IOMMU implementation
.clang-format | 1 +
Documentation/driver-api/generic_pt.rst | 142 ++
Documentation/driver-api/index.rst | 1 +
drivers/iommu/Kconfig | 2 +
drivers/iommu/Makefile | 1 +
drivers/iommu/amd/Kconfig | 5 +-
drivers/iommu/amd/Makefile | 2 +-
drivers/iommu/amd/amd_iommu.h | 1 -
drivers/iommu/amd/amd_iommu_types.h | 110 +-
drivers/iommu/amd/io_pgtable.c | 577 --------
drivers/iommu/amd/io_pgtable_v2.c | 370 ------
drivers/iommu/amd/iommu.c | 538 ++++----
drivers/iommu/generic_pt/.kunitconfig | 13 +
drivers/iommu/generic_pt/Kconfig | 67 +
drivers/iommu/generic_pt/fmt/Makefile | 26 +
drivers/iommu/generic_pt/fmt/amdv1.h | 408 ++++++
drivers/iommu/generic_pt/fmt/defs_amdv1.h | 21 +
drivers/iommu/generic_pt/fmt/defs_x86_64.h | 21 +
drivers/iommu/generic_pt/fmt/iommu_amdv1.c | 15 +
drivers/iommu/generic_pt/fmt/iommu_mock.c | 10 +
drivers/iommu/generic_pt/fmt/iommu_template.h | 48 +
drivers/iommu/generic_pt/fmt/iommu_x86_64.c | 11 +
drivers/iommu/generic_pt/fmt/x86_64.h | 251 ++++
drivers/iommu/generic_pt/iommu_pt.h | 1157 +++++++++++++++++
drivers/iommu/generic_pt/kunit_generic_pt.h | 713 ++++++++++
drivers/iommu/generic_pt/kunit_iommu.h | 182 +++
drivers/iommu/generic_pt/kunit_iommu_pt.h | 486 +++++++
drivers/iommu/generic_pt/pt_common.h | 358 +++++
drivers/iommu/generic_pt/pt_defs.h | 329 +++++
drivers/iommu/generic_pt/pt_fmt_defaults.h | 233 ++++
drivers/iommu/generic_pt/pt_iter.h | 636 +++++++++
drivers/iommu/generic_pt/pt_log2.h | 122 ++
drivers/iommu/io-pgtable.c | 4 -
drivers/iommu/iommufd/Kconfig | 1 +
drivers/iommu/iommufd/iommufd_test.h | 11 +-
drivers/iommu/iommufd/selftest.c | 438 +++----
include/linux/generic_pt/common.h | 167 +++
include/linux/generic_pt/iommu.h | 270 ++++
include/linux/io-pgtable.h | 2 -
tools/testing/selftests/iommu/iommufd.c | 60 +-
tools/testing/selftests/iommu/iommufd_utils.h | 12 +
41 files changed, 6212 insertions(+), 1610 deletions(-)
create mode 100644 Documentation/driver-api/generic_pt.rst
delete mode 100644 drivers/iommu/amd/io_pgtable.c
delete mode 100644 drivers/iommu/amd/io_pgtable_v2.c
create mode 100644 drivers/iommu/generic_pt/.kunitconfig
create mode 100644 drivers/iommu/generic_pt/Kconfig
create mode 100644 drivers/iommu/generic_pt/fmt/Makefile
create mode 100644 drivers/iommu/generic_pt/fmt/amdv1.h
create mode 100644 drivers/iommu/generic_pt/fmt/defs_amdv1.h
create mode 100644 drivers/iommu/generic_pt/fmt/defs_x86_64.h
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_amdv1.c
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_mock.c
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_template.h
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_x86_64.c
create mode 100644 drivers/iommu/generic_pt/fmt/x86_64.h
create mode 100644 drivers/iommu/generic_pt/iommu_pt.h
create mode 100644 drivers/iommu/generic_pt/kunit_generic_pt.h
create mode 100644 drivers/iommu/generic_pt/kunit_iommu.h
create mode 100644 drivers/iommu/generic_pt/kunit_iommu_pt.h
create mode 100644 drivers/iommu/generic_pt/pt_common.h
create mode 100644 drivers/iommu/generic_pt/pt_defs.h
create mode 100644 drivers/iommu/generic_pt/pt_fmt_defaults.h
create mode 100644 drivers/iommu/generic_pt/pt_iter.h
create mode 100644 drivers/iommu/generic_pt/pt_log2.h
create mode 100644 include/linux/generic_pt/common.h
create mode 100644 include/linux/generic_pt/iommu.h
base-commit: cc1d7df505790fe734117b41455f1fe82ebf5ae5
--
2.43.0
Problem
=======
When host APEI is unable to claim a synchronous external abort (SEA)
during guest abort, today KVM directly injects an asynchronous SError
into the VCPU then resumes it. The injected SError usually results in
unpleasant guest kernel panic.
One of the major situation of guest SEA is when VCPU consumes recoverable
uncorrected memory error (UER), which is not uncommon at all in modern
datacenter servers with large amounts of physical memory. Although SError
and guest panic is sufficient to stop the propagation of corrupted memory,
there is room to recover from an UER in a more graceful manner.
Proposed Solution
=================
The idea is, we can replay the SEA to the faulting VCPU. If the memory
error consumption or the fault that cause SEA is not from guest kernel,
the blast radius can be limited to the poison-consuming guest process,
while the VM can keep running.
In addition, instead of doing under the hood without involving userspace,
there are benefits to redirect the SEA to VMM:
- VM customers care about the disruptions caused by memory errors, and
VMM usually has the responsibility to start the process of notifying
the customers of memory error events in their VMs. For example some
cloud provider emits a critical log in their observability UI [1], and
provides a playbook for customers on how to mitigate disruptions to
their workloads.
- VMM can protect future memory error consumption by unmapping the poisoned
pages from stage-2 page table with KVM userfault [2], or by splitting the
memslot that contains the poisoned pages.
- VMM can keep track of SEA events in the VM. When VMM thinks the status
on the host or the VM is bad enough, e.g. number of distinct SEAs
exceeds a threshold, it can restart the VM on another healthy host.
- Behavior parity with x86 architecture. When machine check exception
(MCE) is caused by VCPU, kernel or KVM signals userspace SIGBUS to
let VMM either recover from the MCE, or terminate itself with VM.
The prior RFC proposes to implement SIGBUS on arm64 as well, but
Marc preferred KVM exit over signal [3]. However, implementation
aside, returning SEA to VMM is on par with returning MCE to VMM.
Once SEA is redirected to VMM, among other actions, VMM is encouraged
to inject external aborts into the faulting VCPU.
New UAPIs
=========
This patchset introduces following userspace-visible changes to empower
VMM to control what happens for SEA on guest memory:
- KVM_CAP_ARM_SEA_TO_USER. While taking SEA, if userspace has enabled
this new capability at VM creation, and the SEA is not owned by kernel
allocated memory, instead of injecting SError, return KVM_EXIT_ARM_SEA
to userspace.
- KVM_EXIT_ARM_SEA. This is the VM exit reason VMM gets. The details
about the SEA is provided in arm_sea as much as possible, including
sanitized ESR value at EL2, faulting guest virtual and physical
addresses if available.
* From v3 [4]
- Rebased on commit 3a8660878839 ("Linux 6.18-rc1").
- In selftest, print a message if GVA or GPA expects to be valid.
* From v2 [5]:
- Rebased on "[PATCH] KVM: arm64: nv: Handle SEAs due to VNCR redirection" [6]
and kvmarm/next commit 7b8346bd9fce6 ("KVM: arm64: Don't attempt vLPI
mappings when vPE allocation is disabled")
- Took the host_owns_sea implementation from Oliver [7, 8].
- Excluded the guest SEA injection patches.
- Updated selftest.
* From v1 [9]:
- Rebased on commit 4d62121ce9b5 ("KVM: arm64: vgic-debug: Avoid
dereferencing NULL ITE pointer").
- Sanitize ESR_EL2 before reporting it to userspace.
- Do not do KVM_EXIT_ARM_SEA when SEA is caused by memory allocated to
stage-2 translation table.
[1] https://cloud.google.com/solutions/sap/docs/manage-host-errors
[2] https://lore.kernel.org/kvm/20250109204929.1106563-1-jthoughton@google.com
[3] https://lore.kernel.org/kvm/86pljbqqh0.wl-maz@kernel.org
[4] https://lore.kernel.org/kvmarm/20250731205844.1346839-1-jiaqiyan@google.com
[5] https://lore.kernel.org/kvm/20250604050902.3944054-1-jiaqiyan@google.com
[6] https://lore.kernel.org/kvmarm/20250729182342.3281742-1-oliver.upton@linux.…
[7] https://lore.kernel.org/kvm/aHFohmTb9qR_JG1E@linux.dev
[8] https://lore.kernel.org/kvm/aHK-DPufhLy5Dtuk@linux.dev
[9] https://lore.kernel.org/kvm/20250505161412.1926643-1-jiaqiyan@google.com
Jiaqi Yan (3):
KVM: arm64: VM exit to userspace to handle SEA
KVM: selftests: Test for KVM_EXIT_ARM_SEA
Documentation: kvm: new UAPI for handling SEA
Documentation/virt/kvm/api.rst | 61 ++++
arch/arm64/include/asm/kvm_host.h | 2 +
arch/arm64/kvm/arm.c | 5 +
arch/arm64/kvm/mmu.c | 68 +++-
include/uapi/linux/kvm.h | 10 +
tools/arch/arm64/include/asm/esr.h | 2 +
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../testing/selftests/kvm/arm64/sea_to_user.c | 331 ++++++++++++++++++
tools/testing/selftests/kvm/lib/kvm_util.c | 1 +
9 files changed, 480 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/kvm/arm64/sea_to_user.c
--
2.51.0.760.g7b8bcc2412-goog
This series addresses comments and combines into one the two
series [1] and [2], and adds review-bys.
This series refactors the KHO framework to better support in-kernel
users like the upcoming LUO. The current design, which relies on a
notifier chain and debugfs for control, is too restrictive for direct
programmatic use.
The core of this rework is the removal of the notifier chain in favor of
a direct registration API. This decouples clients from the shutdown-time
finalization sequence, allowing them to manage their preserved state
more flexibly and at any time.
Also, this series fixes a memory corruption bug in KHO that occurs when
KFENCE is enabled.
The root cause is that KHO metadata, allocated via kzalloc(), can be
randomly serviced by kfence_alloc(). When a kernel boots via KHO, the
early memblock allocator is restricted to a "scratch area". This forces
the KFENCE pool to be allocated within this scratch area, creating a
conflict. If KHO metadata is subsequently placed in this pool, it gets
corrupted during the next kexec operation.
[1] https://lore.kernel.org/all/20251007033100.836886-1-pasha.tatashin@soleen.c…
[2] https://lore.kernel.org/all/20251015053121.3978358-1-pasha.tatashin@soleen.…
Mike Rapoport (Microsoft) (1):
kho: drop notifiers
Pasha Tatashin (9):
kho: allow to drive kho from within kernel
kho: make debugfs interface optional
kho: add interfaces to unpreserve folios and page ranes
kho: don't unpreserve memory during abort
liveupdate: kho: move to kernel/liveupdate
kho: move kho debugfs directory to liveupdate
liveupdate: kho: warn and fail on metadata or preserved memory in
scratch area
liveupdate: kho: Increase metadata bitmap size to PAGE_SIZE
liveupdate: kho: allocate metadata directly from the buddy allocator
Documentation/core-api/kho/concepts.rst | 2 +-
MAINTAINERS | 3 +-
include/linux/kexec_handover.h | 53 +-
init/Kconfig | 2 +
kernel/Kconfig.kexec | 15 -
kernel/Makefile | 2 +-
kernel/liveupdate/Kconfig | 38 ++
kernel/liveupdate/Makefile | 5 +
kernel/{ => liveupdate}/kexec_handover.c | 588 +++++++++-----------
kernel/liveupdate/kexec_handover_debug.c | 25 +
kernel/liveupdate/kexec_handover_debugfs.c | 216 +++++++
kernel/liveupdate/kexec_handover_internal.h | 56 ++
lib/test_kho.c | 30 +-
mm/memblock.c | 62 +--
tools/testing/selftests/kho/init.c | 2 +-
tools/testing/selftests/kho/vmtest.sh | 1 +
16 files changed, 645 insertions(+), 455 deletions(-)
create mode 100644 kernel/liveupdate/Kconfig
create mode 100644 kernel/liveupdate/Makefile
rename kernel/{ => liveupdate}/kexec_handover.c (78%)
create mode 100644 kernel/liveupdate/kexec_handover_debug.c
create mode 100644 kernel/liveupdate/kexec_handover_debugfs.c
create mode 100644 kernel/liveupdate/kexec_handover_internal.h
base-commit: f406055cb18c6e299c4a783fc1effeb16be41803
--
2.51.0.915.g61a8936c21-goog
Hi all,
Now that the merge window is over, here's a respin of the previous
iteration rebased on the latest bpf-next_base. The bug triggering the
XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF failure when CONFIG_DEBUG_VM is
enabled hasn't been fixed yet so I've moved the test to the flaky
table.
The test_xsk.sh script covers many AF_XDP use cases. The tests it runs
are defined in xksxceiver.c. Since this script is used to test real
hardware, the goal here is to leave it as it is, and only integrate the
tests that run on veth peers into the test_progs framework.
Some tests are flaky so they can't be integrated in the CI as they are.
I think that fixing their flakyness would require a significant amount of
work. So, as first step, I've excluded them from the list of tests
migrated to the CI (cf PATCH 14). If these tests get fixed at some
point, integrating them into the CI will be straightforward.
PATCH 1 extracts test_xsk[.c/.h] from xskxceiver[.c/.h] to make the
tests available to test_progs.
PATCH 2 to 7 fix small issues in the current test
PATCH 8 to 13 handle all errors to release resources instead of calling
exit() when any error occurs.
PATCH 14 isolates some flaky tests
PATCH 15 integrate the non-flaky tests to the test_progs framework
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet(a)bootlin.com>
---
Changes in v5:
- Rebase on latest bpf-next_base
- Move XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF to the flaky table
- Add Maciej's reviewed-by
- Link to v4: https://lore.kernel.org/r/20250924-xsk-v4-0-20e57537b876@bootlin.com
Changes in v4:
- Fix test_xsk.sh's summary report.
- Merge PATCH 11 & 12 together, otherwise PATCH 11 fails to build.
- Split old PATCH 3 in two patches. The first one fixes
testapp_stats_rx_dropped(), the second one fixes
testapp_xdp_shared_umem(). The unecessary frees (in
testapp_stats_rx_full() and testapp_stats_fill_empty() are removed)
- Link to v3: https://lore.kernel.org/r/20250904-xsk-v3-0-ce382e331485@bootlin.com
Changes in v3:
- Rebase on latest bpf-next_base to integrate commit c9110e6f7237 ("selftests/bpf:
Fix count write in testapp_xdp_metadata_copy()").
- Move XDP_METADATA_COPY_* tests from flaky-tests to nominal tests
- Link to v2: https://lore.kernel.org/r/20250902-xsk-v2-0-17c6345d5215@bootlin.com
Changes in v2:
- Rebase on the latest bpf-next_base and integrate the newly added tests
to the work (adjust_tail* and tx_queue_consumer tests)
- Re-order patches to split xkxceiver sooner.
- Fix the bug reported by Maciej.
- Fix verbose mode in test_xsk.sh by keeping kselftest (remove PATCH 1,
7 and 8)
- Link to v1: https://lore.kernel.org/r/20250313-xsk-v1-0-7374729a93b9@bootlin.com
---
Bastien Curutchet (eBPF Foundation) (15):
selftests/bpf: test_xsk: Split xskxceiver
selftests/bpf: test_xsk: Initialize bitmap before use
selftests/bpf: test_xsk: Fix __testapp_validate_traffic()'s return value
selftests/bpf: test_xsk: fix memory leak in testapp_stats_rx_dropped()
selftests/bpf: test_xsk: fix memory leak in testapp_xdp_shared_umem()
selftests/bpf: test_xsk: Wrap test clean-up in functions
selftests/bpf: test_xsk: Release resources when swap fails
selftests/bpf: test_xsk: Add return value to init_iface()
selftests/bpf: test_xsk: Don't exit immediately when xsk_attach fails
selftests/bpf: test_xsk: Don't exit immediately when gettimeofday fails
selftests/bpf: test_xsk: Don't exit immediately when workers fail
selftests/bpf: test_xsk: Don't exit immediately if validate_traffic fails
selftests/bpf: test_xsk: Don't exit immediately on allocation failures
selftests/bpf: test_xsk: Isolate flaky tests
selftests/bpf: test_xsk: Integrate test_xsk.c to test_progs framework
tools/testing/selftests/bpf/Makefile | 11 +-
tools/testing/selftests/bpf/prog_tests/test_xsk.c | 2595 ++++++++++++++++++++
tools/testing/selftests/bpf/prog_tests/test_xsk.h | 294 +++
tools/testing/selftests/bpf/prog_tests/xsk.c | 146 ++
tools/testing/selftests/bpf/xskxceiver.c | 2696 +--------------------
tools/testing/selftests/bpf/xskxceiver.h | 156 --
6 files changed, 3174 insertions(+), 2724 deletions(-)
---
base-commit: bd61720310e0b11bfbb7c8e1f373bb87d98451d4
change-id: 20250218-xsk-0cf90e975d14
Best regards,
--
Bastien Curutchet (eBPF Foundation) <tux(a)bootlin.com>
The current KUnit documentation does not mention the kunit.enable
kernel parameter, making it unclear how to troubleshoot cases where
KUnit tests do not run as expected.
Add a note explaining kunit.enable parmaeter. Disabling this parameter
prevents all KUnit tests from running even if CONFIG_KUNIT is enabled.
Signed-off-by: Yuya Ishikawa <ishikawa.yuy-00(a)jp.fujitsu.com>
---
Documentation/dev-tools/kunit/run_manual.rst | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/Documentation/dev-tools/kunit/run_manual.rst b/Documentation/dev-tools/kunit/run_manual.rst
index 699d92885075..98e8d5b28808 100644
--- a/Documentation/dev-tools/kunit/run_manual.rst
+++ b/Documentation/dev-tools/kunit/run_manual.rst
@@ -35,6 +35,12 @@ or be built into the kernel.
a good way of quickly testing everything applicable to the current
config.
+ KUnit can be enabled or disabled at boot time, and this behavior is
+ controlled by the kunit.enable kernel parameter.
+ By default, kunit.enable is set to 1 because KUNIT_DEFAULT_ENABLED is
+ enabled by default. To ensure that tests are executed as expected,
+ verify that kunit.enable=1 at boot time.
+
Once we have built our kernel (and/or modules), it is simple to run
the tests. If the tests are built-in, they will run automatically on the
kernel boot. The results will be written to the kernel log (``dmesg``)
--
2.47.3
v22: fixing build error due to -march=zicfiss being picked in gcc-13 and above
but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v22:
- CONFIG_RISCV_USER_CFI was by default "n". With dual vdso support it is
default "y" (if toolchain supports it). Fixing build error due to
"-march=zicfiss" being picked in gcc-13 partially. gcc-13 only recognizes the
flag but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
- picked up tags and some cosmetic changes in commit message for dual vdso
patch.
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v22:
- Link to v21: https://lore.kernel.org/r/20251015-v5_user_cfi_series-v21-0-6a07856e90e7@ri…
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 22 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 95 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 27 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 3 +
tools/testing/selftests/riscv/cfi/Makefile | 16 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/riscv_cfi_test.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2475 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
v22: fixing build error due to -march=zicfiss being picked in gcc-13 and above
but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v22:
- CONFIG_RISCV_USER_CFI was by default "n". With dual vdso support it is
default "y" (if toolchain supports it). Fixing build error due to
"-march=zicfiss" being picked in gcc-13 partially. gcc-13 only recognizes the
flag but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
- picked up tags and some cosmetic changes in commit message for dual vdso
patch.
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v22:
- Link to v21: https://lore.kernel.org/r/20251015-v5_user_cfi_series-v21-0-6a07856e90e7@ri…
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 22 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 95 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 27 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 3 +
tools/testing/selftests/riscv/cfi/Makefile | 16 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/riscv_cfi_test.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2475 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
The core scheduling is for smt enabled cpus. It is not returns
failure and gives plenty of error messages and not clearly points
to the smt issue if the smt is disabled. It just mention
"not a core sched system" and many other messages. For example:
Not a core sched system
tid=210574, / tgid=210574 / pgid=210574: ffffffffffffffff
Not a core sched system
tid=210575, / tgid=210575 / pgid=210574: ffffffffffffffff
Not a core sched system
tid=210577, / tgid=210575 / pgid=210574: ffffffffffffffff
(similar things many other times)
In this patch, the test will first read /sys/devices/system/cpu/smt/active,
if the file cannot be opened or its value is 0, the test is skipped with
an explanatory message. This helps developers understand why it is skipped
and avoids unnecessary attention when running the full selftest suite.
Cc: stable(a)vger.kernel.org
Signed-off-by: Yifei Liu <yifei.l.liu(a)oracle.com>
---
tools/testing/selftests/sched/cs_prctl_test.c | 23 ++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/sched/cs_prctl_test.c b/tools/testing/selftests/sched/cs_prctl_test.c
index 52d97fae4dbd..7ce8088cde6a 100644
--- a/tools/testing/selftests/sched/cs_prctl_test.c
+++ b/tools/testing/selftests/sched/cs_prctl_test.c
@@ -32,6 +32,8 @@
#include <stdlib.h>
#include <string.h>
+#include "../kselftest.h"
+
#if __GLIBC_PREREQ(2, 30) == 0
#include <sys/syscall.h>
static pid_t gettid(void)
@@ -109,6 +111,22 @@ static void handle_usage(int rc, char *msg)
exit(rc);
}
+int check_smt(void)
+{
+ int c = 0;
+ FILE *file;
+
+ file = fopen("/sys/devices/system/cpu/smt/active", "r");
+ if (!file)
+ return 0;
+ c = fgetc(file) - 0x30;
+ fclose(file);
+ if (c == 0 || c == 1)
+ return c;
+ //if fgetc returns EOF or -1 for correupted files, return 0.
+ return 0;
+}
+
static unsigned long get_cs_cookie(int pid)
{
unsigned long long cookie;
@@ -271,7 +289,10 @@ int main(int argc, char *argv[])
delay = -1;
srand(time(NULL));
-
+ if (!check_smt()) {
+ ksft_test_result_skip("smt not enabled\n");
+ return 1;
+ }
/* put into separate process group */
if (setpgid(0, 0) != 0)
handle_error("process group");
--
2.50.1
From: Chia-Yu Chang <chia-yu.chang(a)nokia-bell-labs.com>
Hello,
Plesae find the v4 AccECN case handling patch series, which covers
several excpetional case handling of Accurate ECN spec (RFC9768),
adds new identifiers to be used by CC modules, adds ecn_delta into
rate_sample, and keeps the ACE counter for computation, etc.
This patch series is part of the full AccECN patch series, which is available at
https://github.com/L4STeam/linux-net-next/commits/upstream_l4steam/
Best regards,
Chia-Yu
---
v4:
- Add previous #13 in v2 back after dicussion with the RFC author.
- Add TCP_ACCECN_OPTION_PERSIST to tcp_ecn_option sysctl to ignore AccECN fallback policy on sending AccECN option.
v3:
- Add additional min() check if pkts_acked_ewma is not initialized in #1.
- Change TCP_CONG_WANTS_ECT_1 into individual flag add helper function INET_ECN_xmit_wants_ect_1() in #3.
- Add empty line between variable declarations and code in #4.
- Update commit message to fix old AccECN commits in #5.
- Remove unnecessary brackets in #10.
- Move patch #3 in v2 to a later Prague patch serise and remove patch #13 in v2.
---
Chia-Yu Chang (11):
tcp: L4S ECT(1) identifier and NEEDS_ACCECN for CC modules
tcp: disable RFC3168 fallback identifier for CC modules
tcp: accecn: handle unexpected AccECN negotiation feedback
tcp: accecn: retransmit downgraded SYN in AccECN negotiation
tcp: move increment of num_retrans
tcp: accecn: retransmit SYN/ACK without AccECN option or non-AccECN
SYN/ACK
tcp: accecn: unset ECT if receive or send ACE=0 in AccECN negotiaion
tcp: accecn: fallback outgoing half link to non-AccECN
tcp: accecn: verify ACE counter in 1st ACK after AccECN negotiation
tcp: accecn: detect loss ACK w/ AccECN option and add
TCP_ACCECN_OPTION_PERSIST
tcp: accecn: enable AccECN
Ilpo Järvinen (2):
tcp: try to avoid safer when ACKs are thinned
gro: flushing when CWR is set negatively affects AccECN
Documentation/networking/ip-sysctl.rst | 4 +-
.../networking/net_cachelines/tcp_sock.rst | 1 +
include/linux/tcp.h | 4 +-
include/net/inet_ecn.h | 20 +++-
include/net/tcp.h | 32 ++++++-
include/net/tcp_ecn.h | 92 ++++++++++++++-----
net/ipv4/sysctl_net_ipv4.c | 4 +-
net/ipv4/tcp.c | 2 +
net/ipv4/tcp_cong.c | 10 +-
net/ipv4/tcp_input.c | 58 ++++++++++--
net/ipv4/tcp_minisocks.c | 40 +++++---
net/ipv4/tcp_offload.c | 3 +-
net/ipv4/tcp_output.c | 42 ++++++---
13 files changed, 241 insertions(+), 71 deletions(-)
--
2.34.1
Running this test on a system with only one CPU is not a recipe for
success. However, there's no clear-cut reason why it absolutely
shouldn't work, so the test shouldn't completely reject such a platform.
At present, the *3/4 calculation will return zero on these platforms and
the test fails. So, instead just skip that calculation.
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Brendan Jackman <jackmanb(a)google.com>
---
tools/testing/selftests/kvm/mmu_stress_test.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c
index 6a437d2be9fa444b34c2a73308a9d1c7ff3cc4f5..b5bd6fbad32a9ad5247a52ecf811b29293763e2e 100644
--- a/tools/testing/selftests/kvm/mmu_stress_test.c
+++ b/tools/testing/selftests/kvm/mmu_stress_test.c
@@ -263,8 +263,10 @@ static void calc_default_nr_vcpus(void)
TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)",
errno, strerror(errno));
- nr_vcpus = CPU_COUNT(&possible_mask) * 3/4;
+ nr_vcpus = CPU_COUNT(&possible_mask);
TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?");
+ if (nr_vcpus >= 2)
+ nr_vcpus = nr_vcpus * 3/4;
}
int main(int argc, char *argv[])
---
base-commit: 6b36119b94d0b2bb8cea9d512017efafd461d6ac
change-id: 20251007-b4-kvm-mmu-stresstest-1proc-e6157c13787a
Best regards,
--
Brendan Jackman <jackmanb(a)google.com>
This series introduces NUMA-aware memory placement support for KVM guests
with guest_memfd memory backends. It builds upon Fuad Tabba's work (V17)
that enabled host-mapping for guest_memfd memory [1] and can be applied
directly applied on KVM tree [2] (branch kvm-next, base commit: a6ad5413,
Merge branch 'guest-memfd-mmap' into HEAD)
== Background ==
KVM's guest-memfd memory backend currently lacks support for NUMA policy
enforcement, causing guest memory allocations to be distributed across host
nodes according to kernel's default behavior, irrespective of any policy
specified by the VMM. This limitation arises because conventional userspace
NUMA control mechanisms like mbind(2) don't work since the memory isn't
directly mapped to userspace when allocations occur.
Fuad's work [1] provides the necessary mmap capability, and this series
leverages it to enable mbind(2).
== Implementation ==
This series implements proper NUMA policy support for guest-memfd by:
1. Adding mempolicy-aware allocation APIs to the filemap layer.
2. Introducing custom inodes (via a dedicated slab-allocated inode cache,
kvm_gmem_inode_info) to store NUMA policy and metadata for guest memory.
3. Implementing get/set_policy vm_ops in guest_memfd to support NUMA
policy.
With these changes, VMMs can now control guest memory placement by mapping
guest_memfd file descriptor and using mbind(2) to specify:
- Policy modes: default, bind, interleave, or preferred
- Host NUMA nodes: List of target nodes for memory allocation
These Policies affect only future allocations and do not migrate existing
memory. This matches mbind(2)'s default behavior which affects only new
allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL flags (Not
supported for guest_memfd as it is unmovable by design).
== Upstream Plan ==
Phased approach as per David's guest_memfd extension overview [3] and
community calls [4]:
Phase 1 (this series):
1. Focuses on shared guest_memfd support (non-CoCo VMs).
2. Builds on Fuad's host-mapping work [1].
Phase2 (future work):
1. NUMA support for private guest_memfd (CoCo VMs).
2. Depends on SNP in-place conversion support [5].
This series provides a clean integration path for NUMA-aware memory
management for guest_memfd and lays the groundwork for future confidential
computing NUMA capabilities.
Thanks,
Shivank
== Changelog ==
- v1,v2: Extended the KVM_CREATE_GUEST_MEMFD IOCTL to pass mempolicy.
- v3: Introduced fbind() syscall for VMM memory-placement configuration.
- v4-v6: Current approach using shared_policy support and vm_ops (based on
suggestions from David [6] and guest_memfd bi-weekly upstream
call discussion [7]).
- v7: Use inodes to store NUMA policy instead of file [8].
- v8: Rebase on top of Fuad's V12: Host mmaping for guest_memfd memory.
- v9: Rebase on top of Fuad's V13 and incorporate review comments
- V10: Rebase on top of Fuad's V17. Use latest guest_memfd inode patch
from Ackerley (with David's review comments). Use newer kmem_cache_create()
API variant with arg parameter (Vlastimil)
- V11: Rebase on kvm-next, remove RFC tag, use Ackerley's latest patch
and fix a rcu race bug during kvm module unload.
[1] https://lore.kernel.org/all/20250729225455.670324-1-seanjc@google.com
[2] https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=next
[3] https://lore.kernel.org/all/c1c9591d-218a-495c-957b-ba356c8f8e09@redhat.com
[4] https://docs.google.com/document/d/1M6766BzdY1Lhk7LiR5IqVR8B8mG3cr-cxTxOrAo…
[5] https://lore.kernel.org/all/20250613005400.3694904-1-michael.roth@amd.com
[6] https://lore.kernel.org/all/6fbef654-36e2-4be5-906e-2a648a845278@redhat.com
[7] https://lore.kernel.org/all/2b77e055-98ac-43a1-a7ad-9f9065d7f38f@amd.com
[8] https://lore.kernel.org/all/diqzbjumm167.fsf@ackerleytng-ctop.c.googlers.com
Ackerley Tng (1):
KVM: guest_memfd: Use guest mem inodes instead of anonymous inodes
Matthew Wilcox (Oracle) (2):
mm/filemap: Add NUMA mempolicy support to filemap_alloc_folio()
mm/filemap: Extend __filemap_get_folio() to support NUMA memory
policies
Shivank Garg (4):
mm/mempolicy: Export memory policy symbols
KVM: guest_memfd: Add slab-allocated inode cache
KVM: guest_memfd: Enforce NUMA mempolicy using shared policy
KVM: guest_memfd: selftests: Add tests for mmap and NUMA policy
support
fs/bcachefs/fs-io-buffered.c | 2 +-
fs/btrfs/compression.c | 4 +-
fs/btrfs/verity.c | 2 +-
fs/erofs/zdata.c | 2 +-
fs/f2fs/compress.c | 2 +-
include/linux/pagemap.h | 18 +-
include/uapi/linux/magic.h | 1 +
mm/filemap.c | 23 +-
mm/mempolicy.c | 6 +
mm/readahead.c | 2 +-
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../testing/selftests/kvm/guest_memfd_test.c | 121 ++++++++
virt/kvm/guest_memfd.c | 262 ++++++++++++++++--
virt/kvm/kvm_main.c | 7 +-
virt/kvm/kvm_mm.h | 9 +-
15 files changed, 412 insertions(+), 50 deletions(-)
--
2.43.0
---
== Earlier Postings ==
v10: https://lore.kernel.org/all/20250811090605.16057-2-shivankg@amd.com
v9: https://lore.kernel.org/all/20250713174339.13981-2-shivankg@amd.com
v8: https://lore.kernel.org/all/20250618112935.7629-1-shivankg@amd.com
v7: https://lore.kernel.org/all/20250408112402.181574-1-shivankg@amd.com
v6: https://lore.kernel.org/all/20250226082549.6034-1-shivankg@amd.com
v5: https://lore.kernel.org/all/20250219101559.414878-1-shivankg@amd.com
v4: https://lore.kernel.org/all/20250210063227.41125-1-shivankg@amd.com
v3: https://lore.kernel.org/all/20241105164549.154700-1-shivankg@amd.com
v2: https://lore.kernel.org/all/20240919094438.10987-1-shivankg@amd.com
v1: https://lore.kernel.org/all/20240916165743.201087-1-shivankg@amd.com
When mapping guest ITS collections, vgic_lpi_stress iterates over
integers in the range [0, nr_cpus), passing them as the target_addr
parameter to its_send_mapc_cmd(). These integers correspond to the
selftest userspace vCPU IDs that we intend to map each ITS collection
to.
However, its_encode_target() within its_send_mapc_cmd() expects a
vCPU's redistributor address--not the vCPU ID--as the target_addr
parameter. This is evident from how its_encode_target() encodes the
target_addr parameter as:
its_mask_encode(&cmd->raw_cmd[2], target_addr >> 16, 51, 16)
This shows that we right-shift the input target_addr parameter by 16
bits before encoding it. This makes sense when the parameter refers to
redistributor addresses (e.g., 0x20000, 0x30000) but not vCPU IDs
(e.g., 0x2, 0x3).
The current impact of passing vCPU IDs to its_send_mapc_cmd() is that
all vCPU IDs become 0x0 after the bit shift. Thus, when
vgic_its_cmd_handle_mapc() receives the ITS command in vgic-its.c, it
always interprets the collection's target_vcpu as 0. All interrupts
sent to collections will be processed by vCPU 0, which defeats the
purpose of this multi-vCPU test.
Fix by left-shifting the vCPU parameter received by its_send_mapc_cmd
16 bits before passing it into its_encode_target for encoding.
Signed-off-by: Maximilian Dittgen <mdittgen(a)amazon.com>
---
To validate the patch, I added the following debug code at the top of vgic_its_cmd_handle_mapc:
u64 raw_cmd2 = le64_to_cpu(its_cmd[2]);
u32 target_addr = its_cmd_get_target_addr(its_cmd);
kvm_info("MAPC: coll_id=%d, raw_cmd[2]=0x%llx, parsed_target=%u\n",
coll_id, raw_cmd2, target_addr);
vcpu = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd));
kvm_info("MAPC: coll_id=%d, vcpu_id=%d\n", coll_id, vcpu ? vcpu->vcpu_id : -1);
I then ran `./vgic_lpi_stress -v 3` to trigger the stress selftest with 3 vCPUs.
Before the patch, the debug logs read:
kvm [20832]: MAPC: coll_id=0, raw_cmd[2]=0x8000000000000000, parsed_target=0
kvm [20832]: MAPC: coll_id=0, vcpu_id=0
kvm [20832]: MAPC: coll_id=1, raw_cmd[2]=0x8000000000000001, parsed_target=0
kvm [20832]: MAPC: coll_id=1, vcpu_id=0
kvm [20832]: MAPC: coll_id=2, raw_cmd[2]=0x8000000000000002, parsed_target=0
kvm [20832]: MAPC: coll_id=2, vcpu_id=0
Note the last bit of the cmd string reflects the collection ID, but the rest of the cmd string reads 0. The handler parses out vCPU 0 for all 3 mapc calls.
After the patch, the debug logs read:
kvm [20019]: MAPC: coll_id=0, raw_cmd[2]=0x8000000000000000, parsed_target=0
kvm [20019]: MAPC: coll_id=0, vcpu_id=0
kvm [20019]: MAPC: coll_id=1, raw_cmd[2]=0x8000000000010001, parsed_target=1
kvm [20019]: MAPC: coll_id=1, vcpu_id=1
kvm [20019]: MAPC: coll_id=2, raw_cmd[2]=0x8000000000020002, parsed_target=2
kvm [20019]: MAPC: coll_id=2, vcpu_id=2
Note that the target vcpu and target collection are both visible in the cmd string. The handler parses out the correct vCPU for all 3 mapc calls.
---
tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
index 09f270545646..23c46ad17221 100644
--- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
+++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
@@ -15,6 +15,8 @@
#include "gic_v3.h"
#include "processor.h"
+#define GITS_COLLECTION_TARGET_SHIFT 16
+
static u64 its_read_u64(unsigned long offset)
{
return readq_relaxed(GITS_BASE_GVA + offset);
@@ -217,7 +219,7 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val
its_encode_cmd(&cmd, GITS_CMD_MAPC);
its_encode_collection(&cmd, collection_id);
- its_encode_target(&cmd, vcpu_id);
+ its_encode_target(&cmd, vcpu_id << GITS_COLLECTION_TARGET_SHIFT);
its_encode_valid(&cmd, valid);
its_send_cmd(cmdq_base, &cmd);
--
2.50.1 (Apple Git-155)
Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christian Schlaeger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597
Hello,
This patch series addresses an issue in the memory failure handling path
where MF_DELAYED is incorrectly treated as an error. This issue was
revealed because guest_memfd’s .error_remove_folio() callback returns
MF_DELAYED.
Currently, when the .error_remove_folio() callback for guest_memfd returns
MF_DELAYED, there are a few issues.
1. truncate_error_folio() maps this to MF_FAILED. This causes
memory_failure() to return -EBUSY, which unconditionally triggers a
SIGBUS. The process’ configured memory corruption kill policy is ignored
- even if PR_MCE_KILL_LATE is set, the process will still get a SIGBUS
on deferred memory failures.
2. “Failed to punch page” is printed, even though MF_DELAYED indicates that
it was intentionally not punched.
The first patch corrects this by updating truncate_error_folio() to
propagate MF_DELAYED to its caller. This allows memory_failure() to return
0, indicating success, and lets the delayed handling proceed as designed.
This patch also updates me_pagecache_clean() to account for the folio's
refcount, which remains elevated during delayed handling, aligning its
logic with me_swapcache_dirty().
The subsequent two patches add KVM selftests to validate the fix and the
expected behavior of guest_memfd memory failure:
The first test patch verifies that memory_failure() now returns 0 in the
delayed case and confirms that SIGBUS signaling logic remains correct for
other scenarios (e.g., madvise injection or PR_MCE_KILL_EARLY).
The second test patch confirms that after a memory failure, the poisoned
page is correctly unmapped from the KVM guest's stage 2 page tables and
that a subsequent access by the guest correctly notifies the userspace VMM
with EHWPOISON.
This patch series is built upon kvm/next. In addition, to align with the
change of INIT_SHARED and to use the macro wrapper in guest_memfd
selftests, we put these patches behind Sean’s patches [1].
For ease of testing, this series is also available, stitched together, at
https://github.com/googleprodkernel/linux-cc/tree/memory-failure-mf-delayed…
[1]: https://lore.kernel.org/all/20251003232606.4070510-1-seanjc@google.com/T/
Thank you,
Lisa Wang (3):
mm: memory_failure: Fix MF_DELAYED handling on truncation during
failure
KVM: selftests: Add memory failure tests in guest_memfd_test
KVM: selftests: Test guest_memfd behavior with respect to stage 2 page
tables
mm/memory-failure.c | 24 +-
.../testing/selftests/kvm/guest_memfd_test.c | 233 ++++++++++++++++++
2 files changed, 248 insertions(+), 9 deletions(-)
--
2.51.0.788.g6d19910ace-goog
This series primarily adds support for DECLARE_PCI_FIXUP_*() in modules.
There are a few drivers that already use this, and so they are
presumably broken when built as modules.
While at it, I wrote some unit tests that emulate a fake PCI device, and
let the PCI framework match/not-match its vendor/device IDs. This test
can be built into the kernel or built as a module.
I also include some infrastructure changes (patch 3 and 4), so that
ARCH=um (the default for kunit.py), ARCH=arm, and ARCH=arm64 will run
these tests by default. These patches have different maintainers and are
independent, so they can probably be picked up separately. I included
them because otherwise the tests in patch 2 aren't so easy to run.
Brian Norris (4):
PCI: Support FIXUP quirks in modules
PCI: Add KUnit tests for FIXUP quirks
um: Select PCI_DOMAINS_GENERIC
kunit: qemu_configs: Add PCI to arm, arm64
arch/um/Kconfig | 1 +
drivers/pci/Kconfig | 11 ++
drivers/pci/Makefile | 1 +
drivers/pci/fixup-test.c | 197 ++++++++++++++++++++++
drivers/pci/quirks.c | 62 +++++++
include/linux/module.h | 18 ++
kernel/module/main.c | 26 +++
tools/testing/kunit/qemu_configs/arm.py | 1 +
tools/testing/kunit/qemu_configs/arm64.py | 1 +
9 files changed, 318 insertions(+)
create mode 100644 drivers/pci/fixup-test.c
--
2.51.0.384.g4c02a37b29-goog
Hi all,
This series addresses an off-by-one bug in the VMA count limit check
and introduces several improvements for clarity, test coverage, and
observability around the VMA limit mechanism.
The VMA count limit, controlled by sysctl_max_map_count, is a critical
safeguard that prevents a single process from consuming excessive kernel
memory by creating too many memory mappings. However, the checks in
do_mmap() and do_brk_flags() used a strict inequality, allowing a
process to exceed this limit by one VMA.
This series begins by fixing this long-standing bug. The subsequent
patches build on this by improving the surrounding code. A comprehensive
selftest is added to validate VMA operations near the limit, preventing
future regressions. The open-coded limit checks are replaced with a
centralized helper, vma_count_remaining(), to improve readability.
For better code clarity, mm_struct->map_count is renamed to the more
apt vma_count.
Finally, a trace event is added to provide observability for processes
that fail allocations due to VMA exhaustion, which is valuable for
debugging and profiling on production systems.
The major changes in this version are:
1. Rebased on mm-new to resolve prior conflicts.
2. The patches to harden and add assertions for the VMA count
have been dropped. David pointed out that these could be
racy if sysctl_max_map_count is changed from userspace at
just the wrong time.
3. The selftest has been completely rewritten per Lorenzo's
feedback to make use of the kselftest harness and vm_util.h
helpers.
4. The trace event has also been updated to contain more useful
information and has been given a more fitting name, per
feedback from Steve and Lorenzo.
Tested on x86_64 and arm64:
1. Build test:
allyesconfig for rename
2. Selftests:
cd tools/testing/selftests/mm && \
make && \
./run_vmtests.sh -t max_vma_count
3. vma tests:
cd tools/testing/vma && \
make && \
./vma
Link to v2:
https://lore.kernel.org/r/20250915163838.631445-1-kaleshsingh@google.com/
Thanks to everyone for their comments and feedback on the previous
versions.
--Kalesh
Kalesh Singh (5):
mm: fix off-by-one error in VMA count limit checks
mm/selftests: add max_vma_count tests
mm: introduce vma_count_remaining()
mm: rename mm_struct::map_count to vma_count
mm/tracing: introduce trace_mm_insufficient_vma_slots event
MAINTAINERS | 2 +
fs/binfmt_elf.c | 2 +-
fs/coredump.c | 2 +-
include/linux/mm.h | 2 -
include/linux/mm_types.h | 2 +-
include/trace/events/vma.h | 32 +
kernel/fork.c | 2 +-
mm/debug.c | 2 +-
mm/internal.h | 3 +
mm/mmap.c | 31 +-
mm/mremap.c | 13 +-
mm/nommu.c | 8 +-
mm/util.c | 1 -
mm/vma.c | 39 +-
mm/vma_internal.h | 2 +
tools/testing/selftests/mm/.gitignore | 1 +
tools/testing/selftests/mm/Makefile | 1 +
.../selftests/mm/max_vma_count_tests.c | 672 ++++++++++++++++++
tools/testing/selftests/mm/run_vmtests.sh | 5 +
tools/testing/vma/vma.c | 32 +-
tools/testing/vma/vma_internal.h | 16 +-
21 files changed, 818 insertions(+), 52 deletions(-)
create mode 100644 include/trace/events/vma.h
create mode 100644 tools/testing/selftests/mm/max_vma_count_tests.c
base-commit: 4c4142c93fc19cd75a024e5c81b0532578a9e187
--
2.51.0.760.g7b8bcc2412-goog
Hello,
this series aims to convert another test to the test_progs framework to
make sure that it is executed in CI for series sent on the mailing list.
test_tc_tunnel.sh tests a variety of tunnels based on BPF: packets are
encapsulated by a BPF program on the client egress. We then check that
those packets can be decapsulated on server ingress side, either thanks
to kernel-based or BPF-based decapsulation. Those tests are run thanks
to two veths in two dedicated namespaces.
- patches 1 to 3 are preparatory patches
- patch 4 introduce tc_tunnel test into test_progs
- patch 5 gets rid of the test_tc_tunnel.sh script
The new test has been executed both in some x86 local qemu machine, as
well as in CI:
# ./test_progs -a tc_tunnel
#454/1 tc_tunnel/ipip_none:OK
#454/2 tc_tunnel/ipip6_none:OK
#454/3 tc_tunnel/ip6tnl_none:OK
#454/4 tc_tunnel/sit_none:OK
#454/5 tc_tunnel/vxlan_eth:OK
#454/6 tc_tunnel/ip6vxlan_eth:OK
#454/7 tc_tunnel/gre_none:OK
#454/8 tc_tunnel/gre_eth:OK
#454/9 tc_tunnel/gre_mpls:OK
#454/10 tc_tunnel/ip6gre_none:OK
#454/11 tc_tunnel/ip6gre_eth:OK
#454/12 tc_tunnel/ip6gre_mpls:OK
#454/13 tc_tunnel/udp_none:OK
#454/14 tc_tunnel/udp_eth:OK
#454/15 tc_tunnel/udp_mpls:OK
#454/16 tc_tunnel/ip6udp_none:OK
#454/17 tc_tunnel/ip6udp_eth:OK
#454/18 tc_tunnel/ip6udp_mpls:OK
#454 tc_tunnel:OK
Summary: 1/18 PASSED, 0 SKIPPED, 0 FAILED
Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore(a)bootlin.com>
---
Alexis Lothoré (eBPF Foundation) (5):
testing/selftests: rename tc_helpers.h to tcx_helpers.h
selftests/bpf: add tc helpers
selftests/bpf: make test_tc_tunnel.bpf.c compatible with big endian platforms
selftests/bpf: integrate test_tc_tunnel.sh tests into test_progs
selftests/bpf: remove test_tc_tunnel.sh
tools/testing/selftests/bpf/Makefile | 2 +-
tools/testing/selftests/bpf/prog_tests/tc_links.c | 46 +-
tools/testing/selftests/bpf/prog_tests/tc_netkit.c | 22 +-
tools/testing/selftests/bpf/prog_tests/tc_opts.c | 40 +-
.../bpf/prog_tests/{tc_helpers.h => tcx_helpers.h} | 6 +-
.../selftests/bpf/prog_tests/test_tc_tunnel.c | 684 +++++++++++++++++++++
.../testing/selftests/bpf/prog_tests/test_tunnel.c | 80 +--
tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 99 ++-
tools/testing/selftests/bpf/tc_helpers.c | 87 +++
tools/testing/selftests/bpf/tc_helpers.h | 9 +
tools/testing/selftests/bpf/test_tc_tunnel.sh | 320 ----------
11 files changed, 884 insertions(+), 511 deletions(-)
---
base-commit: 22267893b8c7f2773896e814800bbe693f206e0c
change-id: 20250811-tc_tunnel-c61342683f18
Best regards,
--
Alexis Lothoré, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
From: Wilfred Mallawa <wilfred.mallawa(a)wdc.com>
During a handshake, an endpoint may specify a maximum record size limit.
Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
maximum record size. Meaning that, the outgoing records from the kernel
can exceed a lower size negotiated during the handshake. In such a case,
the TLS endpoint must send a fatal "record_overflow" alert [1], and
thus the record is discarded.
Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
support. For these devices, supporting TLS record size negotiation is
necessary because the maximum TLS record size supported by the controller
is less than the default 16KB currently used by the kernel.
Currently, there is no way to inform the kernel of such a limit. This patch
adds support to a new setsockopt() option `TLS_TX_MAX_PAYLOAD_LEN` that
allows for setting the maximum plaintext fragment size. Once set, outgoing
records are no larger than the size specified. This option can be used to
specify the record size limit.
[1] https://www.rfc-editor.org/rfc/rfc8449
Tested-by: syzbot(a)syzkaller.appspotmail.com
Signed-off-by: Wilfred Mallawa <wilfred.mallawa(a)wdc.com>
---
Changes V5 -> V6:
- Add NULL check for sw_ctx. Reported by syzbot.
V5: https://lore.kernel.org/netdev/20251014051825.1084403-2-wilfred.opensource@…
---
Documentation/networking/tls.rst | 11 ++++++
include/net/tls.h | 3 ++
include/uapi/linux/tls.h | 2 ++
net/tls/tls_device.c | 2 +-
net/tls/tls_main.c | 62 ++++++++++++++++++++++++++++++++
net/tls/tls_sw.c | 2 +-
6 files changed, 80 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index 36cc7afc2527..dabab17ab84a 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -280,6 +280,17 @@ If the record decrypted turns out to had been padded or is not a data
record it will be decrypted again into a kernel buffer without zero copy.
Such events are counted in the ``TlsDecryptRetry`` statistic.
+TLS_TX_MAX_PAYLOAD_LEN
+~~~~~~~~~~~~~~~~~~~~~~
+
+Sets the maximum size for the plaintext of a protected record.
+
+When this option is set, the kernel enforces this limit on all transmitted TLS
+records, ensuring no plaintext fragment exceeds the specified size. This can be
+used to specify the TLS Record Size Limit [1].
+
+[1] https://datatracker.ietf.org/doc/html/rfc8449
+
Statistics
==========
diff --git a/include/net/tls.h b/include/net/tls.h
index 857340338b69..f2af113728aa 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -53,6 +53,8 @@ struct tls_rec;
/* Maximum data size carried in a TLS record */
#define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14)
+/* Minimum record size limit as per RFC8449 */
+#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6)
#define TLS_HEADER_SIZE 5
#define TLS_NONCE_OFFSET TLS_HEADER_SIZE
@@ -226,6 +228,7 @@ struct tls_context {
u8 rx_conf:3;
u8 zerocopy_sendfile:1;
u8 rx_no_pad:1;
+ u16 tx_max_payload_len;
int (*push_pending_record)(struct sock *sk, int flags);
void (*sk_write_space)(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index b66a800389cc..b8b9c42f848c 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -41,6 +41,7 @@
#define TLS_RX 2 /* Set receive parameters */
#define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */
#define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */
+#define TLS_TX_MAX_PAYLOAD_LEN 5 /* Maximum plaintext size */
/* Supported versions */
#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
@@ -194,6 +195,7 @@ enum {
TLS_INFO_RXCONF,
TLS_INFO_ZC_RO_TX,
TLS_INFO_RX_NO_PAD,
+ TLS_INFO_TX_MAX_PAYLOAD_LEN,
__TLS_INFO_MAX,
};
#define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index a64ae15b1a60..c6289c73cffc 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -461,7 +461,7 @@ static int tls_push_data(struct sock *sk,
/* TLS_HEADER_SIZE is not counted as part of the TLS record, and
* we need to leave room for an authentication tag.
*/
- max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
+ max_open_record_len = tls_ctx->tx_max_payload_len +
prot->prepend_size;
do {
rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index a3ccb3135e51..b96c825b90e9 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -544,6 +544,28 @@ static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval,
return 0;
}
+static int do_tls_getsockopt_tx_payload_len(struct sock *sk, char __user *optval,
+ int __user *optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ u16 payload_len = ctx->tx_max_payload_len;
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len < sizeof(payload_len))
+ return -EINVAL;
+
+ if (put_user(sizeof(payload_len), optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, &payload_len, sizeof(payload_len)))
+ return -EFAULT;
+
+ return 0;
+}
+
static int do_tls_getsockopt(struct sock *sk, int optname,
char __user *optval, int __user *optlen)
{
@@ -563,6 +585,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_getsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_MAX_PAYLOAD_LEN:
+ rc = do_tls_getsockopt_tx_payload_len(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -812,6 +837,30 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
return rc;
}
+static int do_tls_setsockopt_tx_payload_len(struct sock *sk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx);
+ u16 value;
+
+ if (sw_ctx && sw_ctx->open_rec)
+ return -EBUSY;
+
+ if (sockptr_is_null(optval) || optlen != sizeof(value))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&value, optval, sizeof(value)))
+ return -EFAULT;
+
+ if (value < TLS_MIN_RECORD_SIZE_LIM || value > TLS_MAX_PAYLOAD_SIZE)
+ return -EINVAL;
+
+ ctx->tx_max_payload_len = value;
+
+ return 0;
+}
+
static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
unsigned int optlen)
{
@@ -833,6 +882,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_MAX_PAYLOAD_LEN:
+ lock_sock(sk);
+ rc = do_tls_setsockopt_tx_payload_len(sk, optval, optlen);
+ release_sock(sk);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -1022,6 +1076,7 @@ static int tls_init(struct sock *sk)
ctx->tx_conf = TLS_BASE;
ctx->rx_conf = TLS_BASE;
+ ctx->tx_max_payload_len = TLS_MAX_PAYLOAD_SIZE;
update_sk_prot(sk, ctx);
out:
write_unlock_bh(&sk->sk_callback_lock);
@@ -1111,6 +1166,12 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
goto nla_failure;
}
+ err = nla_put_u16(skb, TLS_INFO_TX_MAX_PAYLOAD_LEN,
+ ctx->tx_max_payload_len);
+
+ if (err)
+ goto nla_failure;
+
rcu_read_unlock();
nla_nest_end(skb, start);
return 0;
@@ -1132,6 +1193,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin)
nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */
nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */
+ nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_MAX_PAYLOAD_LEN */
0;
return size;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index daac9fd4be7e..e76ea38b712a 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1079,7 +1079,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
orig_size = msg_pl->sg.size;
full_record = false;
try_to_copy = msg_data_left(msg);
- record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
+ record_room = tls_ctx->tx_max_payload_len - msg_pl->sg.size;
if (try_to_copy >= record_room) {
try_to_copy = record_room;
full_record = true;
--
2.51.0
Hello,
I was wondering if you received the email I sent last week regarding the
December trip, I would hope you can plan for myself and my family of 14
(10 Adults & 4 Children)
am attaching an itinerary also for you to take a look
thank you
Pina Alvarez
An RFC patch series [1] that add a new DAMON sysfs file for arbitrary
targets removal is under review. Add a selftest for the feature. The
new test uses the feature using the python wrapper of DAMON sysfs
interface, and confirm the expected internal data structure change is
made using drgn.
So this patch series may better to be a part of the other one [1] that
introduces the obsolete_target file. But, because no significant change
is requested on the series so far, I'm posting this as an individual
RFC.
In the next version, I may merge the two series into one, to add all
related changes at one step.
[1] https://lore.kernel.org/20251016214736.84286-1-sj@kernel.org
SeongJae Park (4):
selftests/damon/_damon_sysfs: support obsolete_target file
drgn_dump_damon_status: dump damon_target->obsolete
sysfs.py: extend assert_ctx_committed() for monitoring targets
selftests/damon/sysfs: add obsolete_target test
tools/testing/selftests/damon/_damon_sysfs.py | 11 ++++-
.../selftests/damon/drgn_dump_damon_status.py | 1 +
tools/testing/selftests/damon/sysfs.py | 48 +++++++++++++++++++
3 files changed, 58 insertions(+), 2 deletions(-)
base-commit: 1aba8bd57e6aaa1c9e699c8de66bcc931d4b1116
--
2.47.3
Currently, there is no straightforward way to obtain the master/slave
relationship via netlink. Users have to retrieve all slaves through sysfs
to determine these relationships.
To address this, we can either list all slaves under the bond interface
or display the master index in each slave. Since the number of slaves could
be quite large (e.g., 100+), it is more efficient to show the master
information in the slave entry.
Signed-off-by: Hangbin Liu <liuhangbin(a)gmail.com>
---
drivers/net/bonding/bond_netlink.c | 4 ++++
include/uapi/linux/if_link.h | 1 +
2 files changed, 5 insertions(+)
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 286f11c517f7..ff3f11674a8b 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -29,6 +29,7 @@ static size_t bond_get_slave_size(const struct net_device *bond_dev,
nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE */
nla_total_size(sizeof(s32)) + /* IFLA_BOND_SLAVE_PRIO */
nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_ACTOR_PORT_PRIO */
+ nla_total_size(sizeof(u32)) + /* IFLA_BOND_SLAVE_MASTER */
0;
}
@@ -38,6 +39,9 @@ static int bond_fill_slave_info(struct sk_buff *skb,
{
struct slave *slave = bond_slave_get_rtnl(slave_dev);
+ if (nla_put_u32(skb, IFLA_BOND_SLAVE_MASTER, bond_dev->ifindex))
+ goto nla_put_failure;
+
if (nla_put_u8(skb, IFLA_BOND_SLAVE_STATE, bond_slave_state(slave)))
goto nla_put_failure;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 3b491d96e52e..bad41a1807f7 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1567,6 +1567,7 @@ enum {
IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE,
IFLA_BOND_SLAVE_PRIO,
IFLA_BOND_SLAVE_ACTOR_PORT_PRIO,
+ IFLA_BOND_SLAVE_MASTER,
__IFLA_BOND_SLAVE_MAX,
};
--
2.50.1
On Tue, 17 Jun 2025 16:00:33 +0200 Guillaume Gomez <guillaume1.gomez(a)gmail.com> wrote:
>
> The goal of this patch is to remove the use of 2 unstable
> rustdoc features (`--no-run` and `--test-builder`) and replace it with a
> stable feature: `--output-format=doctest`, which was added in
> https://github.com/rust-lang/rust/pull/134531.
>
> Before this patch, the code was using very hacky methods in order to retrieve
> doctests, modify them as needed and then concatenate all of them in one file.
>
> Now, with this new flag, it instead asks rustdoc to provide the doctests
> code with their associated information such as file path and line number.
>
> Signed-off-by: Guillaume Gomez <guillaume1.gomez(a)gmail.com>
> ---
(Procedural bit: normally we provide a changelog between versions after
this `---` line so that reviewers now what changed so far.)
I finally took a look at this again, so I rebased it and got:
thread 'main' panicked at scripts/rustdoc_test_gen.rs:92:15:
No path candidates found for `rust_kernel_alloc_allocator.rs`.This is likely a bug in the build system, or some files went away while compiling.
which brings me to the bigger point: the main reason to have the new
output format is to avoid all these hacks, including the "find the real
path back to the original file" hack here. More generally, to avoid the
2 scripts approach.
So now we can finally get rid of all that and simplify. That is, we can
just merge it all in a single script that reads the JSON and builds the
result directly, since now we have everything we need (originally I
needed the 2 scripts approach since `rustdoc` executed the test builder
once per test so I had to somehow collect the results).
i.e. no more hundreds of generated files/processes, just a simple pipe.
Anyway, just to check we had everything we needed, I did a quick try --
please see the draft patch below.
I gave it a go -- please see the draft patch below. The diff w.r.t. your
patch would be something like +217 -341, i.e. we get rid of quite a lot
of lines. I added as well some more context in the commit message, and
put the right docs in the unified script. This also improves the sorting
of the tests (it now follows the line number better).
We still have to preserve the support for the old compilers, so what I
think I will do is just have the new script separately, keeping the old
ones as-is until we can remove them when we upgrade the minimum for e.g.
the next Debian Stable.
Cc'ing David and KUnit, since this is closer to getting ready -- please
let me know if this raises alarms for anyone.
Thanks!
Cheers,
Miguel
From 4aa4581e9004cb95534805f73fdae56c454b3d1d Mon Sep 17 00:00:00 2001
From: Guillaume Gomez <guillaume1.gomez(a)gmail.com>
Date: Tue, 17 Jun 2025 16:00:33 +0200
Subject: [PATCH] [TODO] rust: use new `rustdoc`'s `--output-format=doctest`
The goal of this patch is to remove the use of 2 unstable `rustdoc`
features (`--no-run` and `--test-builder`) and replace it with a future
stable feature: `--output-format=doctest` [1].
Before this patch, the KUnit Rust doctests generation needed to employ
several hacks in order to retrieve doctests, modify them as needed and
then concatenate all of them in one file. In particular, it required
using two scripts: one that got run as a test builder by `rustdoc` in
order to extract the data and another that collected the results of all
those processes.
We requested upstream `rustdoc` a feature to get `rustdoc` to generate
the information directly -- one that would also be designed to eventually
be made stable. This resulted in the `--output-format=doctest` flag,
which makes all the information neatly available as a JSON output,
including filenames, line numbers, doctest test bodies and so on.
Thus take advantage of the new flag, which in turn allows to just use
a single script that gets piped that JSON output from the compiler and
uses it to directly build the generated files to be run by KUnit.
Link: https://github.com/rust-lang/rust/issues/134529 [1]
Signed-off-by: Guillaume Gomez <guillaume1.gomez(a)gmail.com>
Co-developed-by: Miguel Ojeda <ojeda(a)kernel.org>
Signed-off-by: Miguel Ojeda <ojeda(a)kernel.org>
---
rust/Makefile | 12 +-
scripts/.gitignore | 1 -
scripts/Makefile | 2 -
scripts/json.rs | 235 +++++++++++++++++++++++++
scripts/remove-stale-files | 2 +
scripts/rustdoc_test_builder.rs | 300 ++++++++++++++++++++++++++------
scripts/rustdoc_test_gen.rs | 265 ----------------------------
7 files changed, 485 insertions(+), 332 deletions(-)
create mode 100644 scripts/json.rs
delete mode 100644 scripts/rustdoc_test_gen.rs
diff --git a/rust/Makefile b/rust/Makefile
index 23c7ae905bd2..93bc456e3576 100644
--- a/rust/Makefile
+++ b/rust/Makefile
@@ -57,7 +57,6 @@ RUST_LIB_SRC ?= $(rustc_sysroot)/lib/rustlib/src/rust/library
ifneq ($(quiet),)
rust_test_quiet=-q
rustdoc_test_quiet=--test-args -q
-rustdoc_test_kernel_quiet=>/dev/null
endif
core-cfgs = \
@@ -224,21 +223,20 @@ quiet_cmd_rustdoc_test_kernel = RUSTDOC TK $<
rm -rf $(objtree)/$(obj)/test/doctests/kernel; \
mkdir -p $(objtree)/$(obj)/test/doctests/kernel; \
OBJTREE=$(abspath $(objtree)) \
- $(RUSTDOC) --test $(filter-out --remap-path-prefix=%,$(rust_flags)) \
+ $(RUSTDOC) $(filter-out --remap-path-prefix=%,$(rust_flags)) \
-L$(objtree)/$(obj) --extern ffi --extern pin_init \
--extern kernel --extern build_error --extern macros \
--extern bindings --extern uapi \
- --no-run --crate-name kernel -Zunstable-options \
+ --crate-name kernel -Zunstable-options \
--sysroot=/dev/null \
+ --output-format=doctest \
$(rustdoc_modifiers_workaround) \
- --test-builder $(objtree)/scripts/rustdoc_test_builder \
- $< $(rustdoc_test_kernel_quiet); \
- $(objtree)/scripts/rustdoc_test_gen
+ $< | $(objtree)/scripts/rustdoc_test_builder
%/doctests_kernel_generated.rs %/doctests_kernel_generated_kunit.c: \
$(src)/kernel/lib.rs $(obj)/kernel.o \
$(objtree)/scripts/rustdoc_test_builder \
- $(objtree)/scripts/rustdoc_test_gen FORCE
+ FORCE
+$(call if_changed,rustdoc_test_kernel)
# We cannot use `-Zpanic-abort-tests` because some tests are dynamic,
diff --git a/scripts/.gitignore b/scripts/.gitignore
index c2ef68848da5..6e6ab7b8f496 100644
--- a/scripts/.gitignore
+++ b/scripts/.gitignore
@@ -7,7 +7,6 @@
/module.lds
/recordmcount
/rustdoc_test_builder
-/rustdoc_test_gen
/sign-file
/sorttable
/target.json
diff --git a/scripts/Makefile b/scripts/Makefile
index 46f860529df5..71c7d9dcd95b 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -10,7 +10,6 @@ hostprogs-always-$(CONFIG_ASN1) += asn1_compiler
hostprogs-always-$(CONFIG_MODULE_SIG_FORMAT) += sign-file
hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert
hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS) += rustdoc_test_builder
-hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS) += rustdoc_test_gen
ifneq ($(or $(CONFIG_X86_64),$(CONFIG_X86_32)),)
always-$(CONFIG_RUST) += target.json
@@ -23,7 +22,6 @@ endif
hostprogs += generate_rust_target
generate_rust_target-rust := y
rustdoc_test_builder-rust := y
-rustdoc_test_gen-rust := y
HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include
HOSTLDLIBS_sorttable = -lpthread
diff --git a/scripts/json.rs b/scripts/json.rs
new file mode 100644
index 000000000000..aff24bfd9213
--- /dev/null
+++ b/scripts/json.rs
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! JSON parser used to parse rustdoc output when retrieving doctests.
+
+use std::collections::HashMap;
+use std::iter::Peekable;
+use std::str::FromStr;
+
+#[derive(Debug, PartialEq, Eq)]
+pub(crate) enum JsonValue {
+ Object(HashMap<String, JsonValue>),
+ String(String),
+ Number(i32),
+ Bool(bool),
+ Array(Vec<JsonValue>),
+ Null,
+}
+
+fn parse_ident<I: Iterator<Item = char>>(
+ iter: &mut I,
+ output: JsonValue,
+ ident: &str,
+) -> Result<JsonValue, String> {
+ let mut ident_iter = ident.chars().skip(1);
+
+ loop {
+ let i = ident_iter.next();
+ if i.is_none() {
+ return Ok(output);
+ }
+ let c = iter.next();
+ if i != c {
+ if let Some(c) = c {
+ return Err(format!("Unexpected character `{c}` when parsing `{ident}`"));
+ }
+ return Err(format!("Missing character when parsing `{ident}`"));
+ }
+ }
+}
+
+fn parse_string<I: Iterator<Item = char>>(iter: &mut I) -> Result<JsonValue, String> {
+ let mut out = String::new();
+
+ while let Some(c) = iter.next() {
+ match c {
+ '\\' => {
+ let Some(c) = iter.next() else { break };
+ match c {
+ '"' | '\\' | '/' => out.push(c),
+ 'b' => out.push(char::from(0x8u8)),
+ 'f' => out.push(char::from(0xCu8)),
+ 't' => out.push('\t'),
+ 'r' => out.push('\r'),
+ 'n' => out.push('\n'),
+ _ => {
+ // This code doesn't handle codepoints so we put the string content as is.
+ out.push('\\');
+ out.push(c);
+ }
+ }
+ }
+ '"' => {
+ return Ok(JsonValue::String(out));
+ }
+ _ => out.push(c),
+ }
+ }
+ Err(format!("Unclosed JSON string `{out}`"))
+}
+
+fn parse_number<I: Iterator<Item = char>>(
+ iter: &mut Peekable<I>,
+ digit: char,
+) -> Result<JsonValue, String> {
+ let mut nb = String::new();
+
+ nb.push(digit);
+ loop {
+ // We peek next character to prevent taking it from the iterator in case it's a comma.
+ if matches!(iter.peek(), Some(',' | '}' | ']')) {
+ break;
+ }
+ let Some(c) = iter.next() else { break };
+ if c.is_whitespace() {
+ break;
+ } else if !c.is_ascii_digit() {
+ return Err(format!("Error when parsing number `{nb}`: found `{c}`"));
+ }
+ nb.push(c);
+ }
+ i32::from_str(&nb)
+ .map(|nb| JsonValue::Number(nb))
+ .map_err(|error| format!("Invalid number: `{error}`"))
+}
+
+fn parse_array<I: Iterator<Item = char>>(iter: &mut Peekable<I>) -> Result<JsonValue, String> {
+ let mut values = Vec::new();
+
+ 'main: loop {
+ let Some(c) = iter.next() else {
+ return Err("Unclosed array".to_string());
+ };
+ if c.is_whitespace() {
+ continue;
+ } else if c == ']' {
+ break;
+ }
+ values.push(parse(iter, c)?);
+ while let Some(c) = iter.next() {
+ if c.is_whitespace() {
+ continue;
+ } else if c == ',' {
+ break;
+ } else if c == ']' {
+ break 'main;
+ } else {
+ return Err(format!("Unexpected `{c}` when parsing array"));
+ }
+ }
+ }
+ Ok(JsonValue::Array(values))
+}
+
+fn parse_object<I: Iterator<Item = char>>(iter: &mut Peekable<I>) -> Result<JsonValue, String> {
+ let mut values = HashMap::new();
+
+ 'main: loop {
+ let Some(c) = iter.next() else {
+ return Err("Unclosed object".to_string());
+ };
+ let key;
+ if c.is_whitespace() {
+ continue;
+ } else if c == '"' {
+ let JsonValue::String(k) = parse_string(iter)? else {
+ unreachable!()
+ };
+ key = k;
+ } else if c == '}' {
+ break;
+ } else {
+ return Err(format!("Expected `\"` when parsing Object, found `{c}`"));
+ }
+
+ // We then get the `:` separator.
+ loop {
+ let Some(c) = iter.next() else {
+ return Err(format!("Missing value after key `{key}`"));
+ };
+ if c.is_whitespace() {
+ continue;
+ } else if c == ':' {
+ break;
+ } else {
+ return Err(format!(
+ "Expected `:` after key, found `{c}` when parsing object"
+ ));
+ }
+ }
+ // Then the value.
+ let value = loop {
+ let Some(c) = iter.next() else {
+ return Err(format!("Missing value after key `{key}`"));
+ };
+ if c.is_whitespace() {
+ continue;
+ } else {
+ break parse(iter, c)?;
+ }
+ };
+
+ if values.contains_key(&key) {
+ return Err(format!("Duplicated key `{key}`"));
+ }
+ values.insert(key, value);
+
+ while let Some(c) = iter.next() {
+ if c.is_whitespace() {
+ continue;
+ } else if c == ',' {
+ break;
+ } else if c == '}' {
+ break 'main;
+ } else {
+ return Err(format!("Unexpected `{c}` when parsing array"));
+ }
+ }
+ }
+ Ok(JsonValue::Object(values))
+}
+
+fn parse<I: Iterator<Item = char>>(iter: &mut Peekable<I>, c: char) -> Result<JsonValue, String> {
+ match c {
+ '{' => parse_object(iter),
+ '"' => parse_string(iter),
+ '[' => parse_array(iter),
+ 't' => parse_ident(iter, JsonValue::Bool(true), "true"),
+ 'f' => parse_ident(iter, JsonValue::Bool(false), "false"),
+ 'n' => parse_ident(iter, JsonValue::Null, "null"),
+ c => {
+ if c.is_ascii_digit() || c == '-' {
+ parse_number(iter, c)
+ } else {
+ Err(format!("Unexpected `{c}` character"))
+ }
+ }
+ }
+}
+
+impl JsonValue {
+ pub(crate) fn parse(input: &str) -> Result<Self, String> {
+ let mut iter = input.chars().peekable();
+ let mut value = None;
+
+ while let Some(c) = iter.next() {
+ if c.is_whitespace() {
+ continue;
+ }
+ value = Some(parse(&mut iter, c)?);
+ break;
+ }
+ while let Some(c) = iter.next() {
+ if c.is_whitespace() {
+ continue;
+ } else {
+ return Err(format!("Unexpected character `{c}` after content"));
+ }
+ }
+ if let Some(value) = value {
+ Ok(value)
+ } else {
+ Err("Empty content".to_string())
+ }
+ }
+}
diff --git a/scripts/remove-stale-files b/scripts/remove-stale-files
index 6e39fa8540df..190dee6b50e8 100755
--- a/scripts/remove-stale-files
+++ b/scripts/remove-stale-files
@@ -26,3 +26,5 @@ rm -f scripts/selinux/genheaders/genheaders
rm -f *.spec
rm -f lib/test_fortify.log
+
+rm -f scripts/rustdoc_test_gen
diff --git a/scripts/rustdoc_test_builder.rs b/scripts/rustdoc_test_builder.rs
index f7540bcf595a..dd65bb670d25 100644
--- a/scripts/rustdoc_test_builder.rs
+++ b/scripts/rustdoc_test_builder.rs
@@ -1,74 +1,260 @@
// SPDX-License-Identifier: GPL-2.0
-//! Test builder for `rustdoc`-generated tests.
+//! Generates KUnit tests from `rustdoc`-generated doctests.
//!
-//! This script is a hack to extract the test from `rustdoc`'s output. Ideally, `rustdoc` would
-//! have an option to generate this information instead, e.g. as JSON output.
+//! KUnit passes a context (`struct kunit *`) to each test, which should be forwarded to the other
+//! KUnit functions and macros.
//!
-//! The `rustdoc`-generated test names look like `{file}_{line}_{number}`, e.g.
-//! `...path_rust_kernel_sync_arc_rs_42_0`. `number` is the "test number", needed in cases like
-//! a macro that expands into items with doctests is invoked several times within the same line.
+//! However, we want to keep this as an implementation detail because:
//!
-//! However, since these names are used for bisection in CI, the line number makes it not stable
-//! at all. In the future, we would like `rustdoc` to give us the Rust item path associated with
-//! the test, plus a "test number" (for cases with several examples per item) and generate a name
-//! from that. For the moment, we generate ourselves a new name, `{file}_{number}` instead, in
-//! the `gen` script (done there since we need to be aware of all the tests in a given file).
+//! - Test code should not care about the implementation.
+//!
+//! - Documentation looks worse if it needs to carry extra details unrelated to the piece
+//! being described.
+//!
+//! - Test code should be able to define functions and call them, without having to carry
+//! the context.
+//!
+//! - Later on, we may want to be able to test non-kernel code (e.g. `core` or third-party
+//! crates) which likely use the standard library `assert*!` macros.
+//!
+//! For this reason, instead of the passed context, `kunit_get_current_test()` is used instead
+//! (i.e. `current->kunit_test`).
+//!
+//! Note that this means other threads/tasks potentially spawned by a given test, if failing, will
+//! report the failure in the kernel log but will not fail the actual test. Saving the pointer in
+//! e.g. a `static` per test does not fully solve the issue either, because currently KUnit does
+//! not support assertions (only expectations) from other tasks. Thus leave that feature for
+//! the future, which simplifies the code here too. We could also simply not allow `assert`s in
+//! other tasks, but that seems overly constraining, and we do want to support them, eventually.
-use std::io::Read;
+use std::{
+ fs::File,
+ io::{BufWriter, Read, Write},
+};
+
+use json::JsonValue;
+
+mod json;
fn main() {
let mut stdin = std::io::stdin().lock();
- let mut body = String::new();
- stdin.read_to_string(&mut body).unwrap();
+ let mut rustdoc_json = String::new();
+ stdin.read_to_string(&mut rustdoc_json).unwrap();
- // Find the generated function name looking for the inner function inside `main()`.
- //
- // The line we are looking for looks like one of the following:
- //
- // ```
- // fn main() { #[allow(non_snake_case)] fn _doctest_main_rust_kernel_file_rs_28_0() {
- // fn main() { #[allow(non_snake_case)] fn _doctest_main_rust_kernel_file_rs_37_0() -> Result<(), impl ::core::fmt::Debug> {
- // ```
- //
- // It should be unlikely that doctest code matches such lines (when code is formatted properly).
- let rustdoc_function_name = body
- .lines()
- .find_map(|line| {
- Some(
- line.split_once("fn main() {")?
- .1
- .split_once("fn ")?
- .1
- .split_once("()")?
- .0,
- )
- .filter(|x| x.chars().all(|c| c.is_alphanumeric() || c == '_'))
- })
- .expect("No test function found in `rustdoc`'s output.");
-
- // Qualify `Result` to avoid the collision with our own `Result` coming from the prelude.
- let body = body.replace(
- &format!("{rustdoc_function_name}() -> Result<(), impl ::core::fmt::Debug> {{"),
- &format!(
- "{rustdoc_function_name}() -> ::core::result::Result<(), impl ::core::fmt::Debug> {{"
- ),
+ let JsonValue::Object(rustdoc) = JsonValue::parse(&rustdoc_json).unwrap() else {
+ panic!("Expected an object")
+ };
+
+ let Some(JsonValue::Number(format_version)) = rustdoc.get("format_version") else {
+ panic!("missing `format_version` field");
+ };
+ assert!(
+ *format_version == 2,
+ "unsupported rustdoc format version: {format_version}"
);
- // For tests that get generated with `Result`, like above, `rustdoc` generates an `unwrap()` on
- // the return value to check there were no returned errors. Instead, we use our assert macro
- // since we want to just fail the test, not panic the kernel.
+ let Some(JsonValue::Array(doctests)) = rustdoc.get("doctests") else {
+ panic!("`doctests` field is missing or has the wrong type");
+ };
+
+ let mut nb_generated = 0;
+ let mut number = 0;
+ let mut last_file = "";
+ let mut rust_tests = String::new();
+ let mut c_test_declarations = String::new();
+ let mut c_test_cases = String::new();
+ for doctest in doctests {
+ let JsonValue::Object(doctest) = doctest else {
+ unreachable!()
+ };
+
+ // We check if we need to skip this test by checking it's a rust code and it's not ignored.
+ if let Some(JsonValue::Object(attributes)) = doctest.get("doctest_attributes") {
+ if attributes.get("rust") != Some(&JsonValue::Bool(true)) {
+ continue;
+ }
+ if let Some(JsonValue::String(ignore)) = attributes.get("ignore") {
+ if ignore != "None" {
+ continue;
+ }
+ }
+ }
+
+ let (
+ Some(JsonValue::String(file)),
+ Some(JsonValue::Number(line)),
+ Some(JsonValue::String(name)),
+ Some(JsonValue::Object(doctest_code)),
+ ) = (
+ doctest.get("file"),
+ doctest.get("line"),
+ doctest.get("name"),
+ doctest.get("doctest_code"),
+ )
+ else {
+ continue;
+ };
+
+ let (
+ Some(JsonValue::String(code)),
+ Some(JsonValue::String(crate_level_code)),
+ Some(JsonValue::Object(wrapper)),
+ ) = (
+ doctest_code.get("code"),
+ doctest_code.get("crate_level"),
+ doctest_code.get("wrapper"),
+ )
+ else {
+ continue;
+ };
+
+ let (Some(JsonValue::String(before)), Some(JsonValue::String(after))) =
+ (wrapper.get("before"), wrapper.get("after"))
+ else {
+ continue;
+ };
+
+ // For tests that get generated with `Result`, `rustdoc` generates an `unwrap()` on
+ // the return value to check there were no returned errors. Instead, we use our assert macro
+ // since we want to just fail the test, not panic the kernel.
+ //
+ // We save the result in a variable so that the failed assertion message looks nicer.
+ let after = if let Some(JsonValue::Bool(true)) = wrapper.get("returns_result") {
+ "\n} let test_return_value = _inner(); assert!(test_return_value.is_ok()); }"
+ } else {
+ after.as_str()
+ };
+
+ let body = format!("{crate_level_code}\n{before}\n{code}{after}\n");
+ nb_generated += 1;
+
+ // Generate an ID sequence ("test number") for each one in the file.
+ if file == last_file {
+ number += 1;
+ } else {
+ number = 0;
+ last_file = file;
+ }
+
+ // Generate a KUnit name (i.e. test name and C symbol) for this test.
+ //
+ // We avoid the line number, like `rustdoc` does, to make things slightly more stable for
+ // bisection purposes. However, to aid developers in mapping back what test failed, we will
+ // print a diagnostics line in the KTAP report.
+ let kunit_name = format!(
+ "rust_doctest_{}_{number}",
+ file.replace('/', "_").replace('.', "_")
+ );
+
+ // Calculate how many lines before `main` function (including the `main` function line).
+ let body_offset = body
+ .lines()
+ .take_while(|line| !line.contains("fn main() {"))
+ .count()
+ + 1;
+
+ use std::fmt::Write;
+ write!(
+ rust_tests,
+ r#"/// Generated `{name}` KUnit test case from a Rust documentation test.
+#[no_mangle]
+pub extern "C" fn {kunit_name}(__kunit_test: *mut ::kernel::bindings::kunit) {{
+ /// Overrides the usual [`assert!`] macro with one that calls KUnit instead.
+ #[allow(unused)]
+ macro_rules! assert {{
+ ($cond:expr $(,)?) => {{{{
+ ::kernel::kunit_assert!(
+ "{kunit_name}", "{file}", __DOCTEST_ANCHOR - {line}, $cond
+ );
+ }}}}
+ }}
+
+ /// Overrides the usual [`assert_eq!`] macro with one that calls KUnit instead.
+ #[allow(unused)]
+ macro_rules! assert_eq {{
+ ($left:expr, $right:expr $(,)?) => {{{{
+ ::kernel::kunit_assert_eq!(
+ "{kunit_name}", "{file}", __DOCTEST_ANCHOR - {line}, $left, $right
+ );
+ }}}}
+ }}
+
+ // Many tests need the prelude, so provide it by default.
+ #[allow(unused)]
+ use ::kernel::prelude::*;
+
+ // Unconditionally print the location of the original doctest (i.e. rather than the location in
+ // the generated file) so that developers can easily map the test back to the source code.
//
- // We save the result in a variable so that the failed assertion message looks nicer.
- let body = body.replace(
- &format!("}} {rustdoc_function_name}().unwrap() }}"),
- &format!("}} let test_return_value = {rustdoc_function_name}(); assert!(test_return_value.is_ok()); }}"),
- );
+ // This information is also printed when assertions fail, but this helps in the successful cases
+ // when the user is running KUnit manually, or when passing `--raw_output` to `kunit.py`.
+ //
+ // This follows the syntax for declaring test metadata in the proposed KTAP v2 spec, which may
+ // be used for the proposed KUnit test attributes API. Thus hopefully this will make migration
+ // easier later on.
+ ::kernel::kunit::info(fmt!(" # {kunit_name}.location: {file}:{line}\n"));
+
+ /// The anchor where the test code body starts.
+ #[allow(unused)]
+ static __DOCTEST_ANCHOR: i32 = ::core::line!() as i32 + {body_offset} + 1;
+ {{
+ {body}
+ main();
+ }}
+}}
+
+"#
+ )
+ .unwrap();
+
+ write!(c_test_declarations, "void {kunit_name}(struct kunit *);\n").unwrap();
+ write!(c_test_cases, " KUNIT_CASE({kunit_name}),\n").unwrap();
+ }
+
+ if nb_generated == 0 {
+ panic!("No test function found in `rustdoc`'s output.");
+ }
+
+ let rust_tests = rust_tests.trim();
+ let c_test_declarations = c_test_declarations.trim();
+ let c_test_cases = c_test_cases.trim();
+
+ write!(
+ BufWriter::new(File::create("rust/doctests_kernel_generated.rs").unwrap()),
+ r#"//! `kernel` crate documentation tests.
+
+const __LOG_PREFIX: &[u8] = b"rust_doctests_kernel\0";
+
+{rust_tests}
+"#
+ )
+ .unwrap();
+
+ write!(
+ BufWriter::new(File::create("rust/doctests_kernel_generated_kunit.c").unwrap()),
+ r#"/*
+ * `kernel` crate documentation tests.
+ */
+
+#include <kunit/test.h>
+
+{c_test_declarations}
+
+static struct kunit_case test_cases[] = {{
+ {c_test_cases}
+ {{ }}
+}};
- // Figure out a smaller test name based on the generated function name.
- let name = rustdoc_function_name.split_once("_rust_kernel_").unwrap().1;
+static struct kunit_suite test_suite = {{
+ .name = "rust_doctests_kernel",
+ .test_cases = test_cases,
+}};
- let path = format!("rust/test/doctests/kernel/{name}");
+kunit_test_suite(test_suite);
- std::fs::write(path, body.as_bytes()).unwrap();
+MODULE_LICENSE("GPL");
+"#
+ )
+ .unwrap();
}
diff --git a/scripts/rustdoc_test_gen.rs b/scripts/rustdoc_test_gen.rs
deleted file mode 100644
index c8f9dc2ab976..000000000000
--- a/scripts/rustdoc_test_gen.rs
+++ /dev/null
@@ -1,265 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-//! Generates KUnit tests from saved `rustdoc`-generated tests.
-//!
-//! KUnit passes a context (`struct kunit *`) to each test, which should be forwarded to the other
-//! KUnit functions and macros.
-//!
-//! However, we want to keep this as an implementation detail because:
-//!
-//! - Test code should not care about the implementation.
-//!
-//! - Documentation looks worse if it needs to carry extra details unrelated to the piece
-//! being described.
-//!
-//! - Test code should be able to define functions and call them, without having to carry
-//! the context.
-//!
-//! - Later on, we may want to be able to test non-kernel code (e.g. `core` or third-party
-//! crates) which likely use the standard library `assert*!` macros.
-//!
-//! For this reason, instead of the passed context, `kunit_get_current_test()` is used instead
-//! (i.e. `current->kunit_test`).
-//!
-//! Note that this means other threads/tasks potentially spawned by a given test, if failing, will
-//! report the failure in the kernel log but will not fail the actual test. Saving the pointer in
-//! e.g. a `static` per test does not fully solve the issue either, because currently KUnit does
-//! not support assertions (only expectations) from other tasks. Thus leave that feature for
-//! the future, which simplifies the code here too. We could also simply not allow `assert`s in
-//! other tasks, but that seems overly constraining, and we do want to support them, eventually.
-
-use std::{
- fs,
- fs::File,
- io::{BufWriter, Read, Write},
- path::{Path, PathBuf},
-};
-
-/// Find the real path to the original file based on the `file` portion of the test name.
-///
-/// `rustdoc` generated `file`s look like `sync_locked_by_rs`. Underscores (except the last one)
-/// may represent an actual underscore in a directory/file, or a path separator. Thus the actual
-/// file might be `sync_locked_by.rs`, `sync/locked_by.rs`, `sync_locked/by.rs` or
-/// `sync/locked/by.rs`. This function walks the file system to determine which is the real one.
-///
-/// This does require that ambiguities do not exist, but that seems fair, especially since this is
-/// all supposed to be temporary until `rustdoc` gives us proper metadata to build this. If such
-/// ambiguities are detected, they are diagnosed and the script panics.
-fn find_real_path<'a>(srctree: &Path, valid_paths: &'a mut Vec<PathBuf>, file: &str) -> &'a str {
- valid_paths.clear();
-
- let potential_components: Vec<&str> = file.strip_suffix("_rs").unwrap().split('_').collect();
-
- find_candidates(srctree, valid_paths, Path::new(""), &potential_components);
- fn find_candidates(
- srctree: &Path,
- valid_paths: &mut Vec<PathBuf>,
- prefix: &Path,
- potential_components: &[&str],
- ) {
- // The base case: check whether all the potential components left, joined by underscores,
- // is a file.
- let joined_potential_components = potential_components.join("_") + ".rs";
- if srctree
- .join("rust/kernel")
- .join(prefix)
- .join(&joined_potential_components)
- .is_file()
- {
- // Avoid `srctree` here in order to keep paths relative to it in the KTAP output.
- valid_paths.push(
- Path::new("rust/kernel")
- .join(prefix)
- .join(joined_potential_components),
- );
- }
-
- // In addition, check whether each component prefix, joined by underscores, is a directory.
- // If not, there is no need to check for combinations with that prefix.
- for i in 1..potential_components.len() {
- let (components_prefix, components_rest) = potential_components.split_at(i);
- let prefix = prefix.join(components_prefix.join("_"));
- if srctree.join("rust/kernel").join(&prefix).is_dir() {
- find_candidates(srctree, valid_paths, &prefix, components_rest);
- }
- }
- }
-
- match valid_paths.as_slice() {
- [] => panic!(
- "No path candidates found for `{file}`. This is likely a bug in the build system, or \
- some files went away while compiling."
- ),
- [valid_path] => valid_path.to_str().unwrap(),
- valid_paths => {
- use std::fmt::Write;
-
- let mut candidates = String::new();
- for path in valid_paths {
- writeln!(&mut candidates, " {path:?}").unwrap();
- }
- panic!(
- "Several path candidates found for `{file}`, please resolve the ambiguity by \
- renaming a file or folder. Candidates:\n{candidates}",
- );
- }
- }
-}
-
-fn main() {
- let srctree = std::env::var("srctree").unwrap();
- let srctree = Path::new(&srctree);
-
- let mut paths = fs::read_dir("rust/test/doctests/kernel")
- .unwrap()
- .map(|entry| entry.unwrap().path())
- .collect::<Vec<_>>();
-
- // Sort paths.
- paths.sort();
-
- let mut rust_tests = String::new();
- let mut c_test_declarations = String::new();
- let mut c_test_cases = String::new();
- let mut body = String::new();
- let mut last_file = String::new();
- let mut number = 0;
- let mut valid_paths: Vec<PathBuf> = Vec::new();
- let mut real_path: &str = "";
- for path in paths {
- // The `name` follows the `{file}_{line}_{number}` pattern (see description in
- // `scripts/rustdoc_test_builder.rs`). Discard the `number`.
- let name = path.file_name().unwrap().to_str().unwrap().to_string();
-
- // Extract the `file` and the `line`, discarding the `number`.
- let (file, line) = name.rsplit_once('_').unwrap().0.rsplit_once('_').unwrap();
-
- // Generate an ID sequence ("test number") for each one in the file.
- if file == last_file {
- number += 1;
- } else {
- number = 0;
- last_file = file.to_string();
-
- // Figure out the real path, only once per file.
- real_path = find_real_path(srctree, &mut valid_paths, file);
- }
-
- // Generate a KUnit name (i.e. test name and C symbol) for this test.
- //
- // We avoid the line number, like `rustdoc` does, to make things slightly more stable for
- // bisection purposes. However, to aid developers in mapping back what test failed, we will
- // print a diagnostics line in the KTAP report.
- let kunit_name = format!("rust_doctest_kernel_{file}_{number}");
-
- // Read the test's text contents to dump it below.
- body.clear();
- File::open(path).unwrap().read_to_string(&mut body).unwrap();
-
- // Calculate how many lines before `main` function (including the `main` function line).
- let body_offset = body
- .lines()
- .take_while(|line| !line.contains("fn main() {"))
- .count()
- + 1;
-
- use std::fmt::Write;
- write!(
- rust_tests,
- r#"/// Generated `{name}` KUnit test case from a Rust documentation test.
-#[no_mangle]
-pub extern "C" fn {kunit_name}(__kunit_test: *mut ::kernel::bindings::kunit) {{
- /// Overrides the usual [`assert!`] macro with one that calls KUnit instead.
- #[allow(unused)]
- macro_rules! assert {{
- ($cond:expr $(,)?) => {{{{
- ::kernel::kunit_assert!(
- "{kunit_name}", "{real_path}", __DOCTEST_ANCHOR - {line}, $cond
- );
- }}}}
- }}
-
- /// Overrides the usual [`assert_eq!`] macro with one that calls KUnit instead.
- #[allow(unused)]
- macro_rules! assert_eq {{
- ($left:expr, $right:expr $(,)?) => {{{{
- ::kernel::kunit_assert_eq!(
- "{kunit_name}", "{real_path}", __DOCTEST_ANCHOR - {line}, $left, $right
- );
- }}}}
- }}
-
- // Many tests need the prelude, so provide it by default.
- #[allow(unused)]
- use ::kernel::prelude::*;
-
- // Unconditionally print the location of the original doctest (i.e. rather than the location in
- // the generated file) so that developers can easily map the test back to the source code.
- //
- // This information is also printed when assertions fail, but this helps in the successful cases
- // when the user is running KUnit manually, or when passing `--raw_output` to `kunit.py`.
- //
- // This follows the syntax for declaring test metadata in the proposed KTAP v2 spec, which may
- // be used for the proposed KUnit test attributes API. Thus hopefully this will make migration
- // easier later on.
- ::kernel::kunit::info(fmt!(" # {kunit_name}.location: {real_path}:{line}\n"));
-
- /// The anchor where the test code body starts.
- #[allow(unused)]
- static __DOCTEST_ANCHOR: i32 = ::core::line!() as i32 + {body_offset} + 1;
- {{
- {body}
- main();
- }}
-}}
-
-"#
- )
- .unwrap();
-
- write!(c_test_declarations, "void {kunit_name}(struct kunit *);\n").unwrap();
- write!(c_test_cases, " KUNIT_CASE({kunit_name}),\n").unwrap();
- }
-
- let rust_tests = rust_tests.trim();
- let c_test_declarations = c_test_declarations.trim();
- let c_test_cases = c_test_cases.trim();
-
- write!(
- BufWriter::new(File::create("rust/doctests_kernel_generated.rs").unwrap()),
- r#"//! `kernel` crate documentation tests.
-
-const __LOG_PREFIX: &[u8] = b"rust_doctests_kernel\0";
-
-{rust_tests}
-"#
- )
- .unwrap();
-
- write!(
- BufWriter::new(File::create("rust/doctests_kernel_generated_kunit.c").unwrap()),
- r#"/*
- * `kernel` crate documentation tests.
- */
-
-#include <kunit/test.h>
-
-{c_test_declarations}
-
-static struct kunit_case test_cases[] = {{
- {c_test_cases}
- {{ }}
-}};
-
-static struct kunit_suite test_suite = {{
- .name = "rust_doctests_kernel",
- .test_cases = test_cases,
-}};
-
-kunit_test_suite(test_suite);
-
-MODULE_LICENSE("GPL");
-"#
- )
- .unwrap();
-}
base-commit: 0d97f2067c166eb495771fede9f7b73999c67f66
--
2.51.0
The vector regset uses the maximum possible vlenb 8192 to allocate a
2^18 bytes buffer to copy the vector register. But most platforms
don’t support the largest vlenb.
The regset has 2 users, ptrace syscall and coredump. When handling the
PTRACE_GETREGSET requests from ptrace syscall, Linux will prepare a
kernel buffer which size is min(user buffer size, limit). A malicious
user process might overwhelm a memory-constrainted system when the
buffer limit is very large. The coredump uses regset_get_alloc() to
get the context of vector register. But this API allocates buffer
before checking whether the target process uses vector extension, this
wastes time to prepare a large memory buffer.
The buffer limit can be determined after getting platform vlenb in the
early boot stage, this can let the regset buffer match real hardware
limits. Also add .active callbacks to let the coredump skip vector part
when target process doesn't use it.
After this patchset, userspace process needs 2 ptrace syscalls to
retrieve the vector regset with PTRACE_GETREGSET. The first ptrace call
only reads the header to get the vlenb information. Then prepare a
suitable buffer to get the register context. The new vector ptrace
kselftest demonstrates it.
---
v2:
- fix issues in vector ptrace kselftest (Andy)
Yong-Xuan Wang (2):
riscv: ptrace: Optimize the allocation of vector regset
selftests: riscv: Add test for the Vector ptrace interface
arch/riscv/include/asm/vector.h | 1 +
arch/riscv/kernel/ptrace.c | 24 +++-
arch/riscv/kernel/vector.c | 2 +
tools/testing/selftests/riscv/vector/Makefile | 5 +-
.../selftests/riscv/vector/vstate_ptrace.c | 134 ++++++++++++++++++
5 files changed, 162 insertions(+), 4 deletions(-)
create mode 100644 tools/testing/selftests/riscv/vector/vstate_ptrace.c
--
2.43.0
Expose resctrl monitoring data via a lightweight perf PMU.
Background: The kernel's initial cache-monitoring interface shipped via
perf (commit 4afbb24ce5e7, 2015). That approach tied monitoring to tasks
and cgroups. Later, cache control was designed around the resctrl
filesystem to better match hardware semantics, and the incompatible perf
CQM code was removed (commit c39a0e2c8850, 2017). This series implements
a thin, generic perf PMU that _is_ compatible with resctrl.
Motivation: perf support enables measuring cache occupancy and memory
bandwidth metrics on hrtimer (high resolution timer) interrupts via eBPF.
Compared with polling from userspace, hrtimer-based reads remove
scheduling jitter and context switch overhead. Further, PMU reads can be
parallel, since the PMU read path need not lock resctrl's rdtgroup_mutex.
Parallelization and reduced jitter enable more accurate snapshots of
cache occupancy and memory bandwidth. [1] has more details on the
motivation and design.
Design: The "resctrl" PMU is a small adapter on top of resctrl's
monitoring path:
- Event selection uses `attr.config` to pass an open `mon_data` fd
(e.g. `mon_L3_00/llc_occupancy`).
- Events must be CPU-bound within the file's domain. Perf is responsible
the read executes on the bound CPU.
- Event init resolves and pins the rdtgroup, prepares struct rmid_read via
mon_event_setup_read(), and validates the bound CPU is in the file's
domain CPU mask.
- Sampling is not supported; reads match the `mon_data` file contents.
- If the rdtgroup is deleted, reads return 0.
Includes a new selftest (tools/testing/selftests/resctrl/pmu_test.c)
to validate the PMU event init path, and adds PMU testing to existing
CMT tests.
Example usage (see Documentation/filesystems/resctrl.rst):
Open a monitoring file and pass its fd in `perf_event_attr.config`, with
`attr.type` set to the `resctrl` PMU type.
The patches are based on top of v6.18-rc1 (commit 3a8660878839).
[1] https://www.youtube.com/watch?v=4BGhAMJdZTc
Jonathan Perry (8):
resctrl: Pin rdtgroup for mon_data file lifetime
resctrl/mon: Split RMID read init from execution
resctrl/mon: Select cpumask before invoking mon_event_read()
resctrl/mon: Create mon_event_setup_read() helper
resctrl: Propagate CPU mask validation error via rr->err
resctrl/pmu: Introduce skeleton PMU and selftests
resctrl/pmu: Use mon_event_setup_read() and validate CPU
resctrl/pmu: Implement .read via direct RMID read; add LLC selftest
Documentation/filesystems/resctrl.rst | 64 ++++
fs/resctrl/Makefile | 2 +-
fs/resctrl/ctrlmondata.c | 118 ++++---
fs/resctrl/internal.h | 24 +-
fs/resctrl/monitor.c | 8 +-
fs/resctrl/pmu.c | 217 +++++++++++++
fs/resctrl/rdtgroup.c | 131 +++++++-
tools/testing/selftests/resctrl/cache.c | 94 +++++-
tools/testing/selftests/resctrl/cmt_test.c | 17 +-
tools/testing/selftests/resctrl/pmu_test.c | 292 ++++++++++++++++++
tools/testing/selftests/resctrl/pmu_utils.c | 32 ++
tools/testing/selftests/resctrl/resctrl.h | 4 +
.../testing/selftests/resctrl/resctrl_tests.c | 1 +
13 files changed, 948 insertions(+), 56 deletions(-)
create mode 100644 fs/resctrl/pmu.c
create mode 100644 tools/testing/selftests/resctrl/pmu_test.c
create mode 100644 tools/testing/selftests/resctrl/pmu_utils.c
Add a series of tests to validate the RV tracefs API and basic
functionality.
* available monitors:
Check that all monitors (from the monitors folder) appear as
available and have a description. Works with nested monitors.
* enable/disable:
Enable and disable all monitors and validate both the enabled file
and the enabled_monitors. Check that enabling container monitors
enables all nested monitors.
* reactors:
Set all reactors and validate the setting, also for nested monitors.
* wwnr with printk:
wwnr is broken on purpose, run it with a load and check that the
printk reactor works. Also validate disabling reacting_on or
monitoring_on prevents reactions.
These tests use the ftracetest suite. The first patch of the series
adapts ftracetest to make this possible.
The enable/disable test cannot pass on upstream without the application
of the fix in [1].
Changes since V1:
- run stressors based on the cpu count on the wwnr/printk test
[1] - https://lore.kernel.org/lkml/87tt0t4u19.fsf@yellow.woof
To: Steven Rostedt <rostedt(a)goodmis.org>
To: Nam Cao <namcao(a)linutronix.de>
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: John Kacur <jkacur(a)redhat.com>
Cc: Waylon Cude <wcude(a)redhat.com>
Cc: linux-trace-kernel(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Gabriele Monaco (2):
selftest/ftrace: Generalise ftracetest to use with RV
selftests/verification: Add initial RV tests
MAINTAINERS | 1 +
tools/testing/selftests/ftrace/ftracetest | 34 ++++++---
.../ftrace/test.d/00basic/mount_options.tc | 2 +-
.../testing/selftests/ftrace/test.d/functions | 6 +-
.../testing/selftests/verification/.gitignore | 2 +
tools/testing/selftests/verification/Makefile | 8 ++
tools/testing/selftests/verification/config | 1 +
tools/testing/selftests/verification/settings | 1 +
.../selftests/verification/test.d/functions | 39 ++++++++++
.../test.d/rv_monitor_enable_disable.tc | 75 +++++++++++++++++++
.../verification/test.d/rv_monitor_reactor.tc | 68 +++++++++++++++++
.../test.d/rv_monitors_available.tc | 18 +++++
.../verification/test.d/rv_wwnr_printk.tc | 30 ++++++++
.../verification/verificationtest-ktap | 8 ++
14 files changed, 279 insertions(+), 14 deletions(-)
create mode 100644 tools/testing/selftests/verification/.gitignore
create mode 100644 tools/testing/selftests/verification/Makefile
create mode 100644 tools/testing/selftests/verification/config
create mode 100644 tools/testing/selftests/verification/settings
create mode 100644 tools/testing/selftests/verification/test.d/functions
create mode 100644 tools/testing/selftests/verification/test.d/rv_monitor_enable_disable.tc
create mode 100644 tools/testing/selftests/verification/test.d/rv_monitor_reactor.tc
create mode 100644 tools/testing/selftests/verification/test.d/rv_monitors_available.tc
create mode 100644 tools/testing/selftests/verification/test.d/rv_wwnr_printk.tc
create mode 100644 tools/testing/selftests/verification/verificationtest-ktap
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
--
2.51.0
The previous implementation incorrectly assumed the original type of
'priv' was void**, leading to an unnecessary and misleading
cast. Correct the cast of the 'priv' pointer in test_dev_action() to
its actual type, long*, removing an unnecessary cast.
As an additional benefit, this fixes an out-of-bounds CHERI fault on
hardware with architectural capabilities. The original implementation
tried to store a capability-sized pointer using the priv
pointer. However, the priv pointer's capability only granted access to
the memory region of its original long type, leading to a bounds
violation since the size of a long is smaller than the size of a
capability. This change ensures that the pointer usage respects the
capabilities' bounds.
Fixes: d03c720e03bd ("kunit: Add APIs for managing devices")
Reviewed-by: David Gow <davidgow(a)google.com>
Signed-off-by: Florian Schmaus <florian.schmaus(a)codasip.com>
---
Changes since v1:
- Fix simple formatting issue: long* -> long *
- Add 'Fixes' git trailer
lib/kunit/kunit-test.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c
index 8c01eabd4eaf..63130a48e237 100644
--- a/lib/kunit/kunit-test.c
+++ b/lib/kunit/kunit-test.c
@@ -739,7 +739,7 @@ static struct kunit_case kunit_current_test_cases[] = {
static void test_dev_action(void *priv)
{
- *(void **)priv = (void *)1;
+ *(long *)priv = 1;
}
static void kunit_device_test(struct kunit *test)
--
2.51.0
Add a series of tests to validate the RV tracefs API and basic
functionality.
* available monitors:
Check that all monitors (from the monitors folder) appear as
available and have a description. Works with nested monitors.
* enable/disable:
Enable and disable all monitors and validate both the enabled file
and the enabled_monitors. Check that enabling container monitors
enables all nested monitors.
* reactors:
Set all reactors and validate the setting, also for nested monitors.
* wwnr with printk:
wwnr is broken on purpose, run it with a load and check that the
printk reactor works. Also validate disabling reacting_on or
monitoring_on prevents reactions.
These tests use the ftracetest suite. The first patch of the series
adapts ftracetest to make this possible.
The enable/disable test cannot pass on upstream without the application
of the fix in [1].
[1] - https://lore.kernel.org/lkml/87tt0t4u19.fsf@yellow.woof
To: Steven Rostedt <rostedt(a)goodmis.org>
To: Nam Cao <namcao(a)linutronix.de>
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: John Kacur <jkacur(a)redhat.com>
Cc: Waylon Cude <wcude(a)redhat.com>
Cc: linux-trace-kernel(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Gabriele Monaco (2):
selftest/ftrace: Generalise ftracetest to use with RV
selftests/verification: Add initial RV tests
MAINTAINERS | 1 +
tools/testing/selftests/ftrace/ftracetest | 34 ++++++---
.../ftrace/test.d/00basic/mount_options.tc | 2 +-
.../testing/selftests/ftrace/test.d/functions | 6 +-
.../testing/selftests/verification/.gitignore | 2 +
tools/testing/selftests/verification/Makefile | 8 ++
tools/testing/selftests/verification/config | 1 +
tools/testing/selftests/verification/settings | 1 +
.../selftests/verification/test.d/functions | 39 ++++++++++
.../test.d/rv_monitor_enable_disable.tc | 75 +++++++++++++++++++
.../verification/test.d/rv_monitor_reactor.tc | 68 +++++++++++++++++
.../test.d/rv_monitors_available.tc | 18 +++++
.../verification/test.d/rv_wwnr_printk.tc | 29 +++++++
.../verification/verificationtest-ktap | 8 ++
14 files changed, 278 insertions(+), 14 deletions(-)
create mode 100644 tools/testing/selftests/verification/.gitignore
create mode 100644 tools/testing/selftests/verification/Makefile
create mode 100644 tools/testing/selftests/verification/config
create mode 100644 tools/testing/selftests/verification/settings
create mode 100644 tools/testing/selftests/verification/test.d/functions
create mode 100644 tools/testing/selftests/verification/test.d/rv_monitor_enable_disable.tc
create mode 100644 tools/testing/selftests/verification/test.d/rv_monitor_reactor.tc
create mode 100644 tools/testing/selftests/verification/test.d/rv_monitors_available.tc
create mode 100644 tools/testing/selftests/verification/test.d/rv_wwnr_printk.tc
create mode 100644 tools/testing/selftests/verification/verificationtest-ktap
base-commit: cec1e6e5d1ab33403b809f79cd20d6aff124ccfe
--
2.51.0
kunit_device_register() only returns error pointers, not NULL.
Furthermore for regular users who aren't testing the KUnit API
itself, errors most likely represent major system failure (e.g. OOM
or sysfs collision) beyond the scope of their own test conditions.
Replace the assert with straightforward error handling for clarity.
Signed-off-by: Robin Murphy <robin.murphy(a)arm.com>
---
This seemed the logical conclusion by inspection, but please do correct
me if I've misunderstood the intent...
---
Documentation/dev-tools/kunit/usage.rst | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst
index 038f480074fd..3452c739dd44 100644
--- a/Documentation/dev-tools/kunit/usage.rst
+++ b/Documentation/dev-tools/kunit/usage.rst
@@ -873,7 +873,8 @@ For example:
// Create a fake device.
fake_device = kunit_device_register(test, "my_device");
- KUNIT_ASSERT_NOT_ERR_OR_NULL(test, fake_device)
+ if (IS_ERR(fake_device))
+ return;
// Pass it to functions which need a device.
dev_managed_string = devm_kstrdup(fake_device, "Hello, World!");
--
2.34.1
From: Li RongQing <lirongqing(a)baidu.com>
Currently, when 'hung_task_panic' is enabled, the kernel panics
immediately upon detecting the first hung task. However, some hung
tasks are transient and allow system recovery, while persistent hangs
should trigger a panic when accumulating beyond a threshold.
Extend the 'hung_task_panic' sysctl to accept a threshold value
specifying the number of hung tasks that must be detected before
triggering a kernel panic. This provides finer control for environments
where transient hangs may occur but persistent hangs should be fatal.
The sysctl now accepts:
- 0: don't panic (maintains original behavior)
- 1: panic on first hung task (maintains original behavior)
- N > 1: panic after N hung tasks are detected in a single scan
This maintains backward compatibility while providing flexibility for
different hang scenarios.
Signed-off-by: Li RongQing <lirongqing(a)baidu.com>
Cc: Andrew Jeffery <andrew(a)codeconstruct.com.au>
Cc: Anshuman Khandual <anshuman.khandual(a)arm.com>
Cc: Arnd Bergmann <arnd(a)arndb.de>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Florian Wesphal <fw(a)strlen.de>
Cc: Jakub Kacinski <kuba(a)kernel.org>
Cc: Jason A. Donenfeld <jason(a)zx2c4.com>
Cc: Joel Granados <joel.granados(a)kernel.org>
Cc: Joel Stanley <joel(a)jms.id.au>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Kees Cook <kees(a)kernel.org>
Cc: Lance Yang <lance.yang(a)linux.dev>
Cc: Liam Howlett <liam.howlett(a)oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat(a)kernel.org>
Cc: "Paul E . McKenney" <paulmck(a)kernel.org>
Cc: Pawan Gupta <pawan.kumar.gupta(a)linux.intel.com>
Cc: Petr Mladek <pmladek(a)suse.com>
Cc: Phil Auld <pauld(a)redhat.com>
Cc: Randy Dunlap <rdunlap(a)infradead.org>
Cc: Russell King <linux(a)armlinux.org.uk>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Simon Horman <horms(a)kernel.org>
Cc: Stanislav Fomichev <sdf(a)fomichev.me>
Cc: Steven Rostedt <rostedt(a)goodmis.org>
---
diff with v3: comments modification, suggested by Lance, Masami, Randy and Petr
diff with v2: do not add a new sysctl, extend hung_task_panic, suggested by Kees Cook
Documentation/admin-guide/kernel-parameters.txt | 20 +++++++++++++-------
Documentation/admin-guide/sysctl/kernel.rst | 9 +++++----
arch/arm/configs/aspeed_g5_defconfig | 2 +-
kernel/configs/debug.config | 2 +-
kernel/hung_task.c | 15 ++++++++++-----
lib/Kconfig.debug | 9 +++++----
tools/testing/selftests/wireguard/qemu/kernel.config | 2 +-
7 files changed, 36 insertions(+), 23 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a51ab46..492f0bc 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1992,14 +1992,20 @@
the added memory block itself do not be affected.
hung_task_panic=
- [KNL] Should the hung task detector generate panics.
- Format: 0 | 1
+ [KNL] Number of hung tasks to trigger kernel panic.
+ Format: <int>
+
+ When set to a non-zero value, a kernel panic will be triggered if
+ the number of detected hung tasks reaches this value.
+
+ 0: don't panic
+ 1: panic immediately on first hung task
+ N: panic after N hung tasks are detected in a single scan
- A value of 1 instructs the kernel to panic when a
- hung task is detected. The default value is controlled
- by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time
- option. The value selected by this boot parameter can
- be changed later by the kernel.hung_task_panic sysctl.
+ The default value is controlled by the
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time option. The value
+ selected by this boot parameter can be changed later by the
+ kernel.hung_task_panic sysctl.
hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC)
terminal devices. Valid values: 0..8
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index f3ee807..0065a55 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -397,13 +397,14 @@ a hung task is detected.
hung_task_panic
===============
-Controls the kernel's behavior when a hung task is detected.
+When set to a non-zero value, a kernel panic will be triggered if the
+number of hung tasks found during a single scan reaches this value.
This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled.
-= =================================================
+= =======================================================
0 Continue operation. This is the default behavior.
-1 Panic immediately.
-= =================================================
+N Panic when N hung tasks are found during a single scan.
+= =======================================================
hung_task_check_count
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 61cee1e..c3b0d5f 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -308,7 +308,7 @@ CONFIG_PANIC_ON_OOPS=y
CONFIG_PANIC_TIMEOUT=-1
CONFIG_SOFTLOCKUP_DETECTOR=y
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_WQ_WATCHDOG=y
# CONFIG_SCHED_DEBUG is not set
CONFIG_FUNCTION_TRACER=y
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index e81327d..9f6ab7d 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -83,7 +83,7 @@ CONFIG_SLUB_DEBUG_ON=y
#
# Debug Oops, Lockups and Hangs
#
-# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DETECT_HUNG_TASK=y
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index b2c1f14..84b4b04 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -81,7 +81,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
* hung task is detected:
*/
static unsigned int __read_mostly sysctl_hung_task_panic =
- IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC;
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -218,8 +218,11 @@ static inline void debug_show_blocker(struct task_struct *task, unsigned long ti
}
#endif
-static void check_hung_task(struct task_struct *t, unsigned long timeout)
+static void check_hung_task(struct task_struct *t, unsigned long timeout,
+ unsigned long prev_detect_count)
{
+ unsigned long total_hung_task;
+
if (!task_is_hung(t, timeout))
return;
@@ -229,9 +232,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
*/
sysctl_hung_task_detect_count++;
+ total_hung_task = sysctl_hung_task_detect_count - prev_detect_count;
trace_sched_process_hang(t);
- if (sysctl_hung_task_panic) {
+ if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) {
console_verbose();
hung_task_show_lock = true;
hung_task_call_panic = true;
@@ -300,6 +304,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
int max_count = sysctl_hung_task_check_count;
unsigned long last_break = jiffies;
struct task_struct *g, *t;
+ unsigned long prev_detect_count = sysctl_hung_task_detect_count;
/*
* If the system crashed already then all bets are off,
@@ -320,7 +325,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
last_break = jiffies;
}
- check_hung_task(t, timeout);
+ check_hung_task(t, timeout, prev_detect_count);
}
unlock:
rcu_read_unlock();
@@ -389,7 +394,7 @@ static const struct ctl_table hung_task_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "hung_task_check_count",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 3034e294..3976c90 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1258,12 +1258,13 @@ config DEFAULT_HUNG_TASK_TIMEOUT
Keeping the default should be fine in most cases.
config BOOTPARAM_HUNG_TASK_PANIC
- bool "Panic (Reboot) On Hung Tasks"
+ int "Number of hung tasks to trigger kernel panic"
depends on DETECT_HUNG_TASK
+ default 0
help
- Say Y here to enable the kernel to panic on "hung tasks",
- which are bugs that cause the kernel to leave a task stuck
- in uninterruptible "D" state.
+ When set to a non-zero value, a kernel panic will be triggered
+ if the number of hung tasks found during a single scan reaches
+ this value.
The panic can be used in combination with panic_timeout,
to cause the system to reboot automatically after a
diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config
index 936b18b..0504c11 100644
--- a/tools/testing/selftests/wireguard/qemu/kernel.config
+++ b/tools/testing/selftests/wireguard/qemu/kernel.config
@@ -81,7 +81,7 @@ CONFIG_WQ_WATCHDOG=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_PANIC_TIMEOUT=-1
CONFIG_STACKTRACE=y
CONFIG_EARLY_PRINTK=y
--
2.9.4
The previous implementation incorrectly assumed the original type of
'priv' was void**, leading to an unnecessary and misleading
cast. Correct the cast of the 'priv' pointer in test_dev_action() to
its actual type, long*, removing an unnecessary cast.
As an additional benefit, this fixes an out-of-bounds CHERI fault on
hardware with architectural capabilities. The original implementation
tried to store a capability-sized pointer using the 'priv'
pointer. However, the 'priv' pointer's capability only granted access
to the memory region of its original long type, leading to a bounds
violation since the size of a long is smaller than the size of a
capability. This change ensures that the pointer usage respects the
capabilities' bounds.
Signed-off-by: Florian Schmaus <florian.schmaus(a)codasip.com>
---
lib/kunit/kunit-test.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c
index 8c01eabd4eaf..957b67818489 100644
--- a/lib/kunit/kunit-test.c
+++ b/lib/kunit/kunit-test.c
@@ -739,7 +739,7 @@ static struct kunit_case kunit_current_test_cases[] = {
static void test_dev_action(void *priv)
{
- *(void **)priv = (void *)1;
+ *(long*)priv = 1;
}
static void kunit_device_test(struct kunit *test)
--
2.51.0
Some high-level virtual drivers need to compute features from their
lower devices, but each currently has its own implementation and may
miss some feature computations. This patch set introduces a common function
to compute features for such devices.
Currently, bonding, team, and bridge have been updated to use the new
helper.
v4:
* update needed_{headroom, tailroom} in the common helper (Ido Schimmel)
* remove unneeded err in team (Stanislav Fomichev)
* remove selftest as `ethtool -k` does not test the dev->*_features. We
can add back the selftest when there is a good way to test. (Sabrina Dubroca)
v3:
a) fix hw_enc_features asign order (Sabrina Dubroca)
b) set virtual dev feature defination in netdev_features.h (Jakub Kicinski)
c) remove unneeded err in team_del_slave (Stanislav Fomichev)
d) remove NETIF_F_HW_ESP test as it needs to be test with GSO pkts (Sabrina Dubroca)
v2:
a) remove hard_header_len setting. I will set needed_headroom for bond/team
in a separate patch as bridge has it's own ways. (Ido Schimmel)
b) Add test file to Makefile, set RET=0 to a proper location. (Ido Schimmel)
Hangbin Liu (4):
net: add a common function to compute features from lowers devices
bonding: use common function to compute the features
team: use common function to compute the features
net: bridge: use common function to compute the features
drivers/net/bonding/bond_main.c | 99 ++-------------------------------
drivers/net/team/team_core.c | 83 ++-------------------------
include/linux/netdev_features.h | 18 ++++++
include/linux/netdevice.h | 1 +
net/bridge/br_if.c | 22 +-------
net/core/dev.c | 95 +++++++++++++++++++++++++++++++
6 files changed, 127 insertions(+), 191 deletions(-)
--
2.50.1
These patches are taken from the LUOv4 series [1] and address recent
comments from Pratyush.
This series refactors the KHO framework to better support in-kernel
users like the upcoming LUO. The current design, which relies on a
notifier chain and debugfs for control, is too restrictive for direct
programmatic use.
The core of this rework is the removal of the notifier chain in favor of
a direct registration API. This decouples clients from the shutdown-time
finalization sequence, allowing them to manage their preserved state
more flexibly and at any time.
In support of this new model, this series also:
- Exports kho_finalize() and kho_abort() for programmatic control.
- Makes the debugfs interface optional.
- Introduces APIs to unpreserve memory and fixes a bug in the abort
path where client state was being incorrectly discarded. Note that
this is an interim step, as a more comprehensive fix is planned as
part of the stateless KHO work [2].
- Moves all KHO code into a new kernel/liveupdate/ directory to
consolidate live update components.
[1] https://lore.kernel.org/all/20250929010321.3462457-1-pasha.tatashin@soleen.…
[2] https://lore.kernel.org/all/20251001011941.1513050-1-jasonmiu@google.com
Mike Rapoport (Microsoft) (1):
kho: drop notifiers
Pasha Tatashin (6):
kho: allow to drive kho from within kernel
kho: make debugfs interface optional
kho: add interfaces to unpreserve folios and page ranes
kho: don't unpreserve memory during abort
liveupdate: kho: move to kernel/liveupdate
kho: move kho debugfs directory to liveupdate
Documentation/core-api/kho/concepts.rst | 2 +-
MAINTAINERS | 3 +-
include/linux/kexec_handover.h | 53 +-
init/Kconfig | 2 +
kernel/Kconfig.kexec | 15 -
kernel/Makefile | 2 +-
kernel/liveupdate/Kconfig | 30 ++
kernel/liveupdate/Makefile | 4 +
kernel/{ => liveupdate}/kexec_handover.c | 515 ++++++++------------
kernel/liveupdate/kexec_handover_debug.c | 216 ++++++++
kernel/liveupdate/kexec_handover_internal.h | 47 ++
lib/test_kho.c | 30 +-
mm/memblock.c | 60 +--
tools/testing/selftests/kho/init.c | 2 +-
tools/testing/selftests/kho/vmtest.sh | 1 +
15 files changed, 553 insertions(+), 429 deletions(-)
create mode 100644 kernel/liveupdate/Kconfig
create mode 100644 kernel/liveupdate/Makefile
rename kernel/{ => liveupdate}/kexec_handover.c (79%)
create mode 100644 kernel/liveupdate/kexec_handover_debug.c
create mode 100644 kernel/liveupdate/kexec_handover_internal.h
base-commit: 4a71531471926e3c391665ee9c42f4e0295a4585
--
2.51.0.618.g983fd99d29-goog
On systems where the shmget() syscall is not supported, tests like
anon_page and shared_waitv will fail. Skip these tests in such cases to
allow the rest of the test suite to run.
Signed-off-by: Carlos Llamas <cmllamas(a)google.com>
---
tools/testing/selftests/futex/functional/futex_wait.c | 2 ++
tools/testing/selftests/futex/functional/futex_waitv.c | 2 ++
2 files changed, 4 insertions(+)
diff --git a/tools/testing/selftests/futex/functional/futex_wait.c b/tools/testing/selftests/futex/functional/futex_wait.c
index 152ca4612886..1269642bb662 100644
--- a/tools/testing/selftests/futex/functional/futex_wait.c
+++ b/tools/testing/selftests/futex/functional/futex_wait.c
@@ -71,6 +71,8 @@ TEST(anon_page)
/* Testing an anon page shared memory */
shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666);
if (shm_id < 0) {
+ if (errno == ENOSYS)
+ ksft_exit_skip("shmget syscall not supported\n");
perror("shmget");
exit(1);
}
diff --git a/tools/testing/selftests/futex/functional/futex_waitv.c b/tools/testing/selftests/futex/functional/futex_waitv.c
index c684b10eb76e..3bc4e5dc70e7 100644
--- a/tools/testing/selftests/futex/functional/futex_waitv.c
+++ b/tools/testing/selftests/futex/functional/futex_waitv.c
@@ -86,6 +86,8 @@ TEST(shared_waitv)
int shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666);
if (shm_id < 0) {
+ if (errno == ENOSYS)
+ ksft_exit_skip("shmget syscall not supported\n");
perror("shmget");
exit(1);
}
--
2.51.0.869.ge66316f041-goog
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 21 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 95 ++++
arch/riscv/include/asm/vdso.h | 7 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 27 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 3 +
tools/testing/selftests/riscv/cfi/Makefile | 16 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/riscv_cfi_test.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2468 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 21 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 95 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 27 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 3 +
tools/testing/selftests/riscv/cfi/Makefile | 16 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/riscv_cfi_test.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2474 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
[ Background ]
On ARM GIC systems and others, the target address of the MSI is translated
by the IOMMU. For GIC, the MSI address page is called "ITS" page. When the
IOMMU is disabled, the MSI address is programmed to the physical location
of the GIC ITS page (e.g. 0x20200000). When the IOMMU is enabled, the ITS
page is behind the IOMMU, so the MSI address is programmed to an allocated
IO virtual address (a.k.a IOVA), e.g. 0xFFFF0000, which must be mapped to
the physical ITS page: IOVA (0xFFFF0000) ===> PA (0x20200000).
When a 2-stage translation is enabled, IOVA will be still used to program
the MSI address, though the mappings will be in two stages:
IOVA (0xFFFF0000) ===> IPA (e.g. 0x80900000) ===> PA (0x20200000)
(IPA stands for Intermediate Physical Address).
If the device that generates MSI is attached to an IOMMU_DOMAIN_DMA, the
IOVA is dynamically allocated from the top of the IOVA space. If attached
to an IOMMU_DOMAIN_UNMANAGED (e.g. a VFIO passthrough device), the IOVA is
fixed to an MSI window reported by the IOMMU driver via IOMMU_RESV_SW_MSI,
which is hardwired to MSI_IOVA_BASE (IOVA==0x8000000) for ARM IOMMUs.
So far, this IOMMU_RESV_SW_MSI works well as kernel is entirely in charge
of the IOMMU translation (1-stage translation), since the IOVA for the ITS
page is fixed and known by kernel. However, with virtual machine enabling
a nested IOMMU translation (2-stage), a guest kernel directly controls the
stage-1 translation with an IOMMU_DOMAIN_DMA, mapping a vITS page (at an
IPA 0x80900000) onto its own IOVA space (e.g. 0xEEEE0000). Then, the host
kernel can't know that guest-level IOVA to program the MSI address.
There have been two approaches to solve this problem:
1. Create an identity mapping in the stage-1. VMM could insert a few RMRs
(Reserved Memory Regions) in guest's IORT. Then the guest kernel would
fetch these RMR entries from the IORT and create an IOMMU_RESV_DIRECT
region per iommu group for a direct mapping. Eventually, the mappings
would look like: IOVA (0x8000000) === IPA (0x8000000) ===> 0x20200000
This requires an IOMMUFD ioctl for kernel and VMM to agree on the IPA.
2. Forward the guest-level MSI IOVA captured by VMM to the host-level GIC
driver, to program the correct MSI IOVA. Forward the VMM-defined vITS
page location (IPA) to the kernel for the stage-2 mapping. Eventually:
IOVA (0xFFFF0000) ===> IPA (0x80900000) ===> PA (0x20200000)
This requires a VFIO ioctl (for IOVA) and an IOMMUFD ioctl (for IPA).
Worth mentioning that when Eric Auger was working on the same topic with
the VFIO iommu uAPI, he had the approach (2) first, and then switched to
the approach (1), suggested by Jean-Philippe for reduction of complexity.
The approach (1) basically feels like the existing VFIO passthrough that
has a 1-stage mapping for the unmanaged domain, yet only by shifting the
MSI mapping from stage 1 (guest-has-no-iommu case) to stage 2 (guest-has-
iommu case). So, it could reuse the existing IOMMU_RESV_SW_MSI piece, by
sharing the same idea of "VMM leaving everything to the kernel".
The approach (2) is an ideal solution, yet it requires additional effort
for kernel to be aware of the 1-stage gIOVA(s) and 2-stage IPAs for vITS
page(s), which demands VMM to closely cooperate.
* It also brings some complicated use cases to the table where the host
or/and guest system(s) has/have multiple ITS pages.
[ Execution ]
The iommu core rework (part-1) for iommufd_sw_msi is merged. So, now the
IOMMU_RESV_SW_MSI can be used as an ABI. VMM can take this hard coded MSI
window and create a direct stage-1 mapping using RMR in the guest's IORT.
However, a proper uAPI must be defined for kernel and VMM to agree on wrt
this virtual MSI window.
Moreover, some use cases might want to map the IOVAs in IOMMU_RESV_SW_MSI
for something else. This requires kernel to provide an interface to shift
the software MSI window to a different region:
https://lore.kernel.org/all/20250909154600.910110-1-shyamsaini@linux.micros…
This series, as a follow-up series, introduces a pair of iommufd options
for user space to configure the software MSI window.
[ Future Plan ]
Part-3 and beyond will continue the effort of supporting the approach (2)
for a complete vITS-to-pITS mapping:
1) Map the phsical ITS page (potentially via IOMMUFD_CMD_IOAS_MAP_MSI)
2) Convey the IOVAs per-irq (potentially via VFIO_IRQ_SET_ACTION_PREPARE)
Note that the set_option uAPI in this series might not fit since this
requires it is an array of MSI IOVAs.)
This series is on github:
https://github.com/nicolinc/iommufd/commits/iommufd_msi_p2-v2
Pairing QEMU branch for testing (approach 1):
https://github.com/nicolinc/qemu/commits/wip/for_iommufd_msi_p2-v2-rmr
Changelog
v2
* Rebase on v6.18-rc1
* Update commit logs and kdocs
* Add a patch fixing iommufd_device_is_attached()
* Add sanity check for overflow and cover it in the selftest
v1 (containing part-1 that is now merged)
https://lore.kernel.org/all/cover.1739005085.git.nicolinc@nvidia.com/
Thanks!
Nicolin
Nicolin Chen (7):
iommufd/device: Move sw_msi_start from igroup to idev
iommufd: Pass in idev to iopt_table_enforce_dev_resv_regions
iommufd/device: Make iommufd_device_is_attached non-static
iommufd: Add IOMMU_OPTION_SW_MSI_START/SIZE ioctls
iommufd/selftest: Add MOCK_FLAGS_DEVICE_NO_ATTACH
iommufd/selftest: Add a testing reserved region
iommufd/selftest: Add coverage for IOMMU_OPTION_SW_MSI_START/SIZE
drivers/iommu/iommufd/iommufd_private.h | 7 +-
drivers/iommu/iommufd/iommufd_test.h | 4 +
include/uapi/linux/iommufd.h | 21 +++-
drivers/iommu/iommufd/device.c | 43 +++----
drivers/iommu/iommufd/driver.c | 4 +-
drivers/iommu/iommufd/io_pagetable.c | 18 ++-
drivers/iommu/iommufd/ioas.c | 113 ++++++++++++++++++
drivers/iommu/iommufd/main.c | 4 +
drivers/iommu/iommufd/selftest.c | 35 +++++-
tools/testing/selftests/iommu/iommufd.c | 105 ++++++++++++++++
.../selftests/iommu/iommufd_fail_nth.c | 21 ++++
11 files changed, 339 insertions(+), 36 deletions(-)
--
2.43.0
According to Peter, we've had for a very long time an issue on some
mutltiouch touchpads where the fingers were stuck in a scrolling mode,
or 3 fingers gesture mode. I was unable to debug it because it was
rather hard to reproduce.
Recently, some people raised the issue again on libinput, and this time
added a recording of the actual bug.
It turns out that the sticky finger quirk that was introduced back in
2017 was only checking the last report, and that those missing releases
also happen when moving from 3 to 1 finger (only 1 is released instead
of 2).
This solution seems to me to be the most sensible, because we could also
add the NSMU quirk to win8 multitouch touchpads, but this would involve
a lot more computations at each report for rather annoying corner cases.
Link: https://gitlab.freedesktop.org/libinput/libinput/-/issues/1194
Signed-off-by: Benjamin Tissoires <bentiss(a)kernel.org>
---
Benjamin Tissoires (2):
HID: multitouch: fix sticky fingers
selftests/hid: add tests for missing release on the Dell Synaptics
drivers/hid/hid-multitouch.c | 27 ++++++-----
.../testing/selftests/hid/tests/test_multitouch.py | 55 ++++++++++++++++++++++
2 files changed, 69 insertions(+), 13 deletions(-)
---
base-commit: 54ba6d9b1393a0061600c0e49c8ebef65d60a8b2
change-id: 20250926-fix-sticky-fingers-8ae88436ae82
Best regards,
--
Benjamin Tissoires <bentiss(a)kernel.org>
This series adds comprehensive testing infrastructure for Netlink
and Generic Netlink
The implementation includes both kernel module and userspace tests to
verify correct Generic Netlink and Netlink behaviors under
various conditions.
Yana Bashlykova (15):
genetlink: add sysfs test module for Generic Netlink
genetlink: add TEST_GENL family for netlink testing
genetlink: add PARALLEL_GENL test family
genetlink: add test case for duplicate genl family registration
genetlink: add test case for family with invalid ops
genetlink: add netlink notifier support
genetlink: add THIRD_GENL family
genetlink: verify unregister fails for non-registered family
genetlink: add LARGE_GENL stress test family
selftests: net: genetlink: add packet capture test infrastructure
selftests: net: genetlink: add /proc/net/netlink test
selftests: net: genetlink: add Generic Netlink controller tests
selftests: net: genetlink: add large family ID resolution test
selftests: net: genetlink: add Netlink and Generic Netlink test suite
selftests: net: genetlink: fix expectation for large family resolution
drivers/net/Kconfig | 2 +
drivers/net/Makefile | 2 +
drivers/net/genetlink/Kconfig | 8 +
drivers/net/genetlink/Makefile | 3 +
.../net-pf-16-proto-16-family-PARALLEL_GENL.c | 1921 ++++++
tools/testing/selftests/net/Makefile | 6 +
tools/testing/selftests/net/genetlink.c | 5152 +++++++++++++++++
7 files changed, 7094 insertions(+)
create mode 100644 drivers/net/genetlink/Kconfig
create mode 100644 drivers/net/genetlink/Makefile
create mode 100644 drivers/net/genetlink/net-pf-16-proto-16-family-PARALLEL_GENL.c
create mode 100644 tools/testing/selftests/net/genetlink.c
--
2.34.1
I looked at the fchmodat2() tests since I've been experiencing some
random intermittent segfaults with them in my test systems, while doing
so I noticed these two issues. Unfortunately I didn't figure out the
original yet, unless I managed to fix it unwittingly.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v3:
- Rebase onto v6.18-rc1.
- Link to v2: https://lore.kernel.org/r/20250812-selftests-fchmodat2-v2-0-f2d5380e94c3@ke…
Changes in v2:
- Rebase onto v6.17-rc1.
- Link to v1: https://lore.kernel.org/r/20250714-selftests-fchmodat2-v1-0-b74f3ee0d09c@ke…
---
Mark Brown (2):
selftests/fchmodat2: Clean up temporary files and directories
selftests/fchmodat2: Use ksft_finished()
tools/testing/selftests/fchmodat2/fchmodat2_test.c | 166 ++++++++++++++-------
1 file changed, 112 insertions(+), 54 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20250711-selftests-fchmodat2-c30374c376f8
Best regards,
--
Mark Brown <broonie(a)kernel.org>
Since v4.1 kernel, a new interface for ftrace called "tracefs" was
introduced, which is usually mounted in /sys/kernel/tracing. Therefore,
tracing files can now be accessed via either the legacy path
/sys/kernel/debug/tracing or the newer path /sys/kernel/tracing.
Signed-off-by: Fushuai Wang <wangfushuai(a)baidu.com>
---
tools/testing/selftests/livepatch/functions.sh | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh
index 46991a029f7c..8ec0cb64ad94 100644
--- a/tools/testing/selftests/livepatch/functions.sh
+++ b/tools/testing/selftests/livepatch/functions.sh
@@ -10,7 +10,11 @@ SYSFS_KERNEL_DIR="/sys/kernel"
SYSFS_KLP_DIR="$SYSFS_KERNEL_DIR/livepatch"
SYSFS_DEBUG_DIR="$SYSFS_KERNEL_DIR/debug"
SYSFS_KPROBES_DIR="$SYSFS_DEBUG_DIR/kprobes"
-SYSFS_TRACING_DIR="$SYSFS_DEBUG_DIR/tracing"
+if [[ -e /sys/kernel/tracing/trace ]]; then
+ SYSFS_TRACING_DIR="$SYSFS_KERNEL_DIR/tracing"
+else
+ SYSFS_TRACING_DIR="$SYSFS_DEBUG_DIR/tracing"
+fi
# Kselftest framework requirement - SKIP code is 4
ksft_skip=4
--
2.36.1
At this point I think everyone in the on the kernel side is happy with
this but there were some questions from the glibc side about the value
of controlling the shadow stack placement and size, especially with the
current inability to reuse the shadow stack for an exited thread. With
support for reuse it would be possible to have a cache of shadow stacks
as is currently supported for the normal stack.
Since the discussion petered out I'm resending this in order to give
people something work with while prototyping. It should be possible to
prototype any potential kernel features to help build out shadow stack
support in userspace by enabling shadow stack writes, as suggested by
Rick Edgecombe this may end up being required anyway for supporting more
exotic scenarios. On all current architectures with the feature writes
to shadow stack require specific instructions so there are still
security benefits even with writes enabled.
I did send a change implementing a feature writing a token on thread
exit to allow reuse:
https://lore.kernel.org/r/20250921-arm64-gcs-exit-token-v1-0-45cf64e648d5@k…
but wasn't planning to refresh it without some indication from the
userspace side that that'd be useful.
Non-process cover letter:
The kernel has added support for shadow stacks, currently x86 only using
their CET feature but both arm64 and RISC-V have equivalent features
(GCS and Zicfiss respectively), I am actively working on GCS[1]. With
shadow stacks the hardware maintains an additional stack containing only
the return addresses for branch instructions which is not generally
writeable by userspace and ensures that any returns are to the recorded
addresses. This provides some protection against ROP attacks and making
it easier to collect call stacks. These shadow stacks are allocated in
the address space of the userspace process.
Our API for shadow stacks does not currently offer userspace any
flexiblity for managing the allocation of shadow stacks for newly
created threads, instead the kernel allocates a new shadow stack with
the same size as the normal stack whenever a thread is created with the
feature enabled. The stacks allocated in this way are freed by the
kernel when the thread exits or shadow stacks are disabled for the
thread. This lack of flexibility and control isn't ideal, in the vast
majority of cases the shadow stack will be over allocated and the
implicit allocation and deallocation is not consistent with other
interfaces. As far as I can tell the interface is done in this manner
mainly because the shadow stack patches were in development since before
clone3() was implemented.
Since clone3() is readily extensible let's add support for specifying a
shadow stack when creating a new thread or process, keeping the current
implicit allocation behaviour if one is not specified either with
clone3() or through the use of clone(). The user must provide a shadow
stack pointer, this must point to memory mapped for use as a shadow
stackby map_shadow_stack() with an architecture specified shadow stack
token at the top of the stack.
Yuri Khrustalev has raised questions from the libc side regarding
discoverability of extended clone3() structure sizes[2], this seems like
a general issue with clone3(). There was a suggestion to add a hwcap on
arm64 which isn't ideal but is doable there, though architecture
specific mechanisms would also be needed for x86 (and RISC-V if it's
support gets merged before this does). The idea has, however, had
strong pushback from the architecture maintainers and it is possible to
detect support for this in clone3() by attempting a call with a
misaligned shadow stack pointer specified so no hwcap has been added.
[1] https://lore.kernel.org/linux-arm-kernel/20241001-arm64-gcs-v13-0-222b78d87…
[2] https://lore.kernel.org/r/aCs65ccRQtJBnZ_5@arm.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v22:
- Rebase onto v6.18-rc1.
- Cover letter updates.
- Link to v21: https://lore.kernel.org/r/20250916-clone3-shadow-stack-v21-0-910493527013@k…
Changes in v21:
- Rebase onto https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git kernel-6.18.clone3
- Rename shadow_stack_token to shstk_token, since it's a simple rename I've
kept the acks and reviews but I dropped the tested-bys just to be safe.
- Link to v20: https://lore.kernel.org/r/20250902-clone3-shadow-stack-v20-0-4d9fff1c53e7@k…
Changes in v20:
- Comment fixes and clarifications in x86 arch_shstk_validate_clone()
from Rick Edgecombe.
- Spelling fix in documentation.
- Link to v19: https://lore.kernel.org/r/20250819-clone3-shadow-stack-v19-0-bc957075479b@k…
Changes in v19:
- Rebase onto v6.17-rc1.
- Link to v18: https://lore.kernel.org/r/20250702-clone3-shadow-stack-v18-0-7965d2b694db@k…
Changes in v18:
- Rebase onto v6.16-rc3.
- Thanks to pointers from Yuri Khrustalev this version has been tested
on x86 so I have removed the RFT tag.
- Clarify clone3_shadow_stack_valid() comment about the Kconfig check.
- Remove redundant GCSB DSYNCs in arm64 code.
- Fix token validation on x86.
- Link to v17: https://lore.kernel.org/r/20250609-clone3-shadow-stack-v17-0-8840ed97ff6f@k…
Changes in v17:
- Rebase onto v6.16-rc1.
- Link to v16: https://lore.kernel.org/r/20250416-clone3-shadow-stack-v16-0-2ffc9ca3917b@k…
Changes in v16:
- Rebase onto v6.15-rc2.
- Roll in fixes from x86 testing from Rick Edgecombe.
- Rework so that the argument is shadow_stack_token.
- Link to v15: https://lore.kernel.org/r/20250408-clone3-shadow-stack-v15-0-3fa245c6e3be@k…
Changes in v15:
- Rebase onto v6.15-rc1.
- Link to v14: https://lore.kernel.org/r/20250206-clone3-shadow-stack-v14-0-805b53af73b9@k…
Changes in v14:
- Rebase onto v6.14-rc1.
- Link to v13: https://lore.kernel.org/r/20241203-clone3-shadow-stack-v13-0-93b89a81a5ed@k…
Changes in v13:
- Rebase onto v6.13-rc1.
- Link to v12: https://lore.kernel.org/r/20241031-clone3-shadow-stack-v12-0-7183eb8bee17@k…
Changes in v12:
- Add the regular prctl() to the userspace API document since arm64
support is queued in -next.
- Link to v11: https://lore.kernel.org/r/20241005-clone3-shadow-stack-v11-0-2a6a2bd6d651@k…
Changes in v11:
- Rebase onto arm64 for-next/gcs, which is based on v6.12-rc1, and
integrate arm64 support.
- Rework the interface to specify a shadow stack pointer rather than a
base and size like we do for the regular stack.
- Link to v10: https://lore.kernel.org/r/20240821-clone3-shadow-stack-v10-0-06e8797b9445@k…
Changes in v10:
- Integrate fixes & improvements for the x86 implementation from Rick
Edgecombe.
- Require that the shadow stack be VM_WRITE.
- Require that the shadow stack base and size be sizeof(void *) aligned.
- Clean up trailing newline.
- Link to v9: https://lore.kernel.org/r/20240819-clone3-shadow-stack-v9-0-962d74f99464@ke…
Changes in v9:
- Pull token validation earlier and report problems with an error return
to parent rather than signal delivery to the child.
- Verify that the top of the supplied shadow stack is VM_SHADOW_STACK.
- Rework token validation to only do the page mapping once.
- Drop no longer needed support for testing for signals in selftest.
- Fix typo in comments.
- Link to v8: https://lore.kernel.org/r/20240808-clone3-shadow-stack-v8-0-0acf37caf14c@ke…
Changes in v8:
- Fix token verification with user specified shadow stack.
- Don't track user managed shadow stacks for child processes.
- Link to v7: https://lore.kernel.org/r/20240731-clone3-shadow-stack-v7-0-a9532eebfb1d@ke…
Changes in v7:
- Rebase onto v6.11-rc1.
- Typo fixes.
- Link to v6: https://lore.kernel.org/r/20240623-clone3-shadow-stack-v6-0-9ee7783b1fb9@ke…
Changes in v6:
- Rebase onto v6.10-rc3.
- Ensure we don't try to free the parent shadow stack in error paths of
x86 arch code.
- Spelling fixes in userspace API document.
- Additional cleanups and improvements to the clone3() tests to support
the shadow stack tests.
- Link to v5: https://lore.kernel.org/r/20240203-clone3-shadow-stack-v5-0-322c69598e4b@ke…
Changes in v5:
- Rebase onto v6.8-rc2.
- Rework ABI to have the user allocate the shadow stack memory with
map_shadow_stack() and a token.
- Force inlining of the x86 shadow stack enablement.
- Move shadow stack enablement out into a shared header for reuse by
other tests.
- Link to v4: https://lore.kernel.org/r/20231128-clone3-shadow-stack-v4-0-8b28ffe4f676@ke…
Changes in v4:
- Formatting changes.
- Use a define for minimum shadow stack size and move some basic
validation to fork.c.
- Link to v3: https://lore.kernel.org/r/20231120-clone3-shadow-stack-v3-0-a7b8ed3e2acc@ke…
Changes in v3:
- Rebase onto v6.7-rc2.
- Remove stale shadow_stack in internal kargs.
- If a shadow stack is specified unconditionally use it regardless of
CLONE_ parameters.
- Force enable shadow stacks in the selftest.
- Update changelogs for RISC-V feature rename.
- Link to v2: https://lore.kernel.org/r/20231114-clone3-shadow-stack-v2-0-b613f8681155@ke…
Changes in v2:
- Rebase onto v6.7-rc1.
- Remove ability to provide preallocated shadow stack, just specify the
desired size.
- Link to v1: https://lore.kernel.org/r/20231023-clone3-shadow-stack-v1-0-d867d0b5d4d0@ke…
---
Mark Brown (8):
arm64/gcs: Return a success value from gcs_alloc_thread_stack()
Documentation: userspace-api: Add shadow stack API documentation
selftests: Provide helper header for shadow stack testing
fork: Add shadow stack support to clone3()
selftests/clone3: Remove redundant flushes of output streams
selftests/clone3: Factor more of main loop into test_clone3()
selftests/clone3: Allow tests to flag if -E2BIG is a valid error code
selftests/clone3: Test shadow stack support
Documentation/userspace-api/index.rst | 1 +
Documentation/userspace-api/shadow_stack.rst | 44 +++++
arch/arm64/include/asm/gcs.h | 8 +-
arch/arm64/kernel/process.c | 8 +-
arch/arm64/mm/gcs.c | 55 +++++-
arch/x86/include/asm/shstk.h | 11 +-
arch/x86/kernel/process.c | 2 +-
arch/x86/kernel/shstk.c | 53 ++++-
include/asm-generic/cacheflush.h | 11 ++
include/linux/sched/task.h | 17 ++
include/uapi/linux/sched.h | 9 +-
kernel/fork.c | 93 +++++++--
tools/testing/selftests/clone3/clone3.c | 226 ++++++++++++++++++----
tools/testing/selftests/clone3/clone3_selftests.h | 65 ++++++-
tools/testing/selftests/ksft_shstk.h | 98 ++++++++++
15 files changed, 620 insertions(+), 81 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20231019-clone3-shadow-stack-15d40d2bf536
Best regards,
--
Mark Brown <broonie(a)kernel.org>
This series makes the output from the ofdlocks test a bit easier for
tooling to work with, and also ignores the generated file while we're
here.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v2:
- Rebase onto v6.18-rc1.
- Link to v1: https://lore.kernel.org/r/20250818-selftest-filelock-ktap-v1-0-d41af77f1396…
---
Mark Brown (3):
kselftest/filelock: Use ksft_perror()
kselftest/filelock: Report each test in oftlocks separately
kselftest/filelock: Add a .gitignore file
tools/testing/selftests/filelock/.gitignore | 1 +
tools/testing/selftests/filelock/ofdlocks.c | 94 +++++++++++++----------------
2 files changed, 42 insertions(+), 53 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20250604-selftest-filelock-ktap-f2ae998a0de0
Best regards,
--
Mark Brown <broonie(a)kernel.org>
From: Li RongQing <lirongqing(a)baidu.com>
Currently, when 'hung_task_panic' is enabled, the kernel panics
immediately upon detecting the first hung task. However, some hung
tasks are transient and the system can recover, while others are
persistent and may accumulate progressively.
This patch extends the 'hung_task_panic' sysctl to allow specifying
the number of hung tasks that must be detected before triggering
a kernel panic. This provides finer control for environments where
transient hangs may occur but persistent hangs should still be fatal.
The sysctl can be set to:
- 0: disabled (never panic)
- 1: original behavior (panic on first hung task)
- N: panic when N hung tasks are detected
This maintains backward compatibility while providing more flexibility
for handling different hang scenarios.
Signed-off-by: Li RongQing <lirongqing(a)baidu.com>
---
Diff with v2: not add new sysctl, extend hung_task_panic
Documentation/admin-guide/kernel-parameters.txt | 20 +++++++++++++-------
Documentation/admin-guide/sysctl/kernel.rst | 3 ++-
arch/arm/configs/aspeed_g5_defconfig | 2 +-
kernel/configs/debug.config | 2 +-
kernel/hung_task.c | 16 +++++++++++-----
lib/Kconfig.debug | 10 ++++++----
tools/testing/selftests/wireguard/qemu/kernel.config | 2 +-
7 files changed, 35 insertions(+), 20 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a51ab46..7d9a8ee 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1992,14 +1992,20 @@
the added memory block itself do not be affected.
hung_task_panic=
- [KNL] Should the hung task detector generate panics.
- Format: 0 | 1
+ [KNL] Number of hung tasks to trigger kernel panic.
+ Format: <int>
+
+ Set this to the number of hung tasks that must be
+ detected before triggering a kernel panic.
+
+ 0: don't panic
+ 1: panic immediately on first hung task
+ N: panic after N hung tasks are detect
- A value of 1 instructs the kernel to panic when a
- hung task is detected. The default value is controlled
- by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time
- option. The value selected by this boot parameter can
- be changed later by the kernel.hung_task_panic sysctl.
+ The default value is controlled by the
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time option. The value
+ selected by this boot parameter can be changed later by the
+ kernel.hung_task_panic sysctl.
hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC)
terminal devices. Valid values: 0..8
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index f3ee807..0a8dfab 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -397,7 +397,8 @@ a hung task is detected.
hung_task_panic
===============
-Controls the kernel's behavior when a hung task is detected.
+When set to a non-zero value, a kernel panic will be triggered if the
+number of detected hung tasks reaches this value
This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled.
= =================================================
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 61cee1e..c3b0d5f 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -308,7 +308,7 @@ CONFIG_PANIC_ON_OOPS=y
CONFIG_PANIC_TIMEOUT=-1
CONFIG_SOFTLOCKUP_DETECTOR=y
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_WQ_WATCHDOG=y
# CONFIG_SCHED_DEBUG is not set
CONFIG_FUNCTION_TRACER=y
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index e81327d..9f6ab7d 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -83,7 +83,7 @@ CONFIG_SLUB_DEBUG_ON=y
#
# Debug Oops, Lockups and Hangs
#
-# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DETECT_HUNG_TASK=y
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index b2c1f14..3929ed9 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -81,7 +81,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
* hung task is detected:
*/
static unsigned int __read_mostly sysctl_hung_task_panic =
- IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC;
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -218,8 +218,11 @@ static inline void debug_show_blocker(struct task_struct *task, unsigned long ti
}
#endif
-static void check_hung_task(struct task_struct *t, unsigned long timeout)
+static void check_hung_task(struct task_struct *t, unsigned long timeout,
+ unsigned long prev_detect_count)
{
+ unsigned long total_hung_task;
+
if (!task_is_hung(t, timeout))
return;
@@ -229,9 +232,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
*/
sysctl_hung_task_detect_count++;
+ total_hung_task = sysctl_hung_task_detect_count - prev_detect_count;
trace_sched_process_hang(t);
- if (sysctl_hung_task_panic) {
+ if (sysctl_hung_task_panic &&
+ (total_hung_task >= sysctl_hung_task_panic)) {
console_verbose();
hung_task_show_lock = true;
hung_task_call_panic = true;
@@ -300,6 +305,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
int max_count = sysctl_hung_task_check_count;
unsigned long last_break = jiffies;
struct task_struct *g, *t;
+ unsigned long prev_detect_count = sysctl_hung_task_detect_count;
/*
* If the system crashed already then all bets are off,
@@ -320,7 +326,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
last_break = jiffies;
}
- check_hung_task(t, timeout);
+ check_hung_task(t, timeout, prev_detect_count);
}
unlock:
rcu_read_unlock();
@@ -389,7 +395,7 @@ static const struct ctl_table hung_task_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "hung_task_check_count",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 3034e294..077b9e4 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1258,12 +1258,14 @@ config DEFAULT_HUNG_TASK_TIMEOUT
Keeping the default should be fine in most cases.
config BOOTPARAM_HUNG_TASK_PANIC
- bool "Panic (Reboot) On Hung Tasks"
+ int "Number of hung tasks to trigger kernel panic"
depends on DETECT_HUNG_TASK
+ default 0
help
- Say Y here to enable the kernel to panic on "hung tasks",
- which are bugs that cause the kernel to leave a task stuck
- in uninterruptible "D" state.
+ The number of hung tasks must be detected to trigger kernel panic.
+
+ - 0: Don't trigger panic
+ - N: Panic when N hung tasks are detected
The panic can be used in combination with panic_timeout,
to cause the system to reboot automatically after a
diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config
index 936b18b..0504c11 100644
--- a/tools/testing/selftests/wireguard/qemu/kernel.config
+++ b/tools/testing/selftests/wireguard/qemu/kernel.config
@@ -81,7 +81,7 @@ CONFIG_WQ_WATCHDOG=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_PANIC_TIMEOUT=-1
CONFIG_STACKTRACE=y
CONFIG_EARLY_PRINTK=y
--
2.9.4
Add the dma_map_benchmark binary to .gitignore to prevent it from being
shown as an untracked file after building the selftests.
Signed-off-by: Kriish Sharma <kriish.sharma2006(a)gmail.com>
---
tools/testing/selftests/dma/.gitignore | 2 ++
1 file changed, 2 insertions(+)
create mode 100644 tools/testing/selftests/dma/.gitignore
diff --git a/tools/testing/selftests/dma/.gitignore b/tools/testing/selftests/dma/.gitignore
new file mode 100644
index 000000000000..b4b99b6ffea3
--- /dev/null
+++ b/tools/testing/selftests/dma/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+dma_map_benchmark
--
2.34.1
From: Steven Rostedt <rostedt(a)goodmis.org>
Commit 64cf7d058a00 ("tracing: Have trace_marker use per-cpu data to read
user space") made an update that fixed both trace_marker and
trace_marker_raw. But the small difference made to trace_marker_raw had a
blatant bug in it that any basic testing would have uncovered.
Unfortunately, the self tests have tests for trace_marker but nothing for
trace_marker_raw which allowed the bug to get upstream.
Add basic selftests to test trace_marker_raw so that this doesn't happen
again.
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
.../ftrace/test.d/00basic/trace_marker_raw.tc | 107 ++++++++++++++++++
1 file changed, 107 insertions(+)
create mode 100644 tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc
new file mode 100644
index 000000000000..7daf7292209e
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc
@@ -0,0 +1,107 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Basic tests on writing to trace_marker_raw
+# requires: trace_marker_raw
+# flags: instance
+
+is_little_endian() {
+ if lscpu | grep -q 'Little Endian'; then
+ echo 1;
+ else
+ echo 0;
+ fi
+}
+
+little=`is_little_endian`
+
+make_str() {
+ id=$1
+ cnt=$2
+
+ if [ $little -eq 1 ]; then
+ val=`printf "\\%03o\\%03o\\%03o\\%03o" \
+ $(($id & 0xff)) \
+ $((($id >> 8) & 0xff)) \
+ $((($id >> 16) & 0xff)) \
+ $((($id >> 24) & 0xff))`
+ else
+ val=`printf "\\%03o\\%03o\\%03o\\%03o" \
+ $((($id >> 24) & 0xff)) \
+ $((($id >> 16) & 0xff)) \
+ $((($id >> 8) & 0xff)) \
+ $(($id & 0xff))`
+ fi
+
+ data=`printf -- 'X%.0s' $(seq $cnt)`
+
+ printf "${val}${data}"
+}
+
+write_buffer() {
+ id=$1
+ size=$2
+
+ # write the string into the raw marker
+ make_str $id $size > trace_marker_raw
+}
+
+
+test_multiple_writes() {
+
+ # Write a bunch of data where the id is the count of
+ # data to write
+ for i in `seq 1 10` `seq 101 110` `seq 1001 1010`; do
+ write_buffer $i $i
+ done
+
+ # add a little buffer
+ echo stop > trace_marker
+
+ # Check to make sure the number of entries is the id (rounded up by 4)
+ awk '/.*: # [0-9a-f]* / {
+ print;
+ cnt = -1;
+ for (i = 0; i < NF; i++) {
+ # The counter is after the "#" marker
+ if ( $i == "#" ) {
+ i++;
+ cnt = strtonum("0x" $i);
+ num = NF - (i + 1);
+ # The number of items is always rounded up by 4
+ cnt2 = int((cnt + 3) / 4) * 4;
+ if (cnt2 != num) {
+ exit 1;
+ }
+ break;
+ }
+ }
+ }
+ // { if (NR > 30) { exit 0; } } ' trace_pipe;
+}
+
+
+get_buffer_data_size() {
+ sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+test_buffer() {
+
+ # The id must be four bytes, test that 3 bytes fails a write
+ if echo -n abc > ./trace_marker_raw ; then
+ echo "Too small of write expected to fail but did not"
+ exit_fail
+ fi
+
+ size=`get_buffer_data_size`
+ echo size = $size
+
+ # Now add a little more than what it can handle
+
+ if write_buffer 0xdeadbeef $size ; then
+ echo "Too big of write expected to fail but did not"
+ exit_fail
+ fi
+}
+
+test_buffer
+test_multiple_writes
--
2.51.0
Parsing KTAP is quite an inconvenience, but most of the time the thing
you really want to know is "did anything fail"?
Let's give the user the his information without them needing
to parse anything.
Because of the use of subshells and namespaces, this needs to be
communicated via a file. Just write arbitrary data into the file and
treat non-empty content as a signal that something failed.
In case any user depends on the current behaviour, such as running this
from a script with `set -e` and parsing the result for failures
afterwards, add a flag they can set to get the old behaviour, namely
--no-error-on-fail.
Signed-off-by: Brendan Jackman <jackmanb(a)google.com>
---
Changes in v2:
- Fixed bug in report_failure()
- Made error-on-fail the default
- Link to v1: https://lore.kernel.org/r/20251007-b4-ksft-error-on-fail-v1-1-71bf058f5662@…
---
tools/testing/selftests/kselftest/runner.sh | 14 ++++++++++----
tools/testing/selftests/run_kselftest.sh | 14 ++++++++++++++
2 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh
index 2c3c58e65a419f5ee8d7dc51a37671237a07fa0b..3a62039fa6217f3453423ff011575d0a1eb8c275 100644
--- a/tools/testing/selftests/kselftest/runner.sh
+++ b/tools/testing/selftests/kselftest/runner.sh
@@ -44,6 +44,12 @@ tap_timeout()
fi
}
+report_failure()
+{
+ echo "not ok $*"
+ echo "$*" >> "$kselftest_failures_file"
+}
+
run_one()
{
DIR="$1"
@@ -105,7 +111,7 @@ run_one()
echo "# $TEST_HDR_MSG"
if [ ! -e "$TEST" ]; then
echo "# Warning: file $TEST is missing!"
- echo "not ok $test_num $TEST_HDR_MSG"
+ report_failure "$test_num $TEST_HDR_MSG"
else
if [ -x /usr/bin/stdbuf ]; then
stdbuf="/usr/bin/stdbuf --output=L "
@@ -123,7 +129,7 @@ run_one()
interpreter=$(head -n 1 "$TEST" | cut -c 3-)
cmd="$stdbuf $interpreter ./$BASENAME_TEST"
else
- echo "not ok $test_num $TEST_HDR_MSG"
+ report_failure "$test_num $TEST_HDR_MSG"
return
fi
fi
@@ -137,9 +143,9 @@ run_one()
echo "ok $test_num $TEST_HDR_MSG # SKIP"
elif [ $rc -eq $timeout_rc ]; then \
echo "#"
- echo "not ok $test_num $TEST_HDR_MSG # TIMEOUT $kselftest_timeout seconds"
+ report_failure "$test_num $TEST_HDR_MSG # TIMEOUT $kselftest_timeout seconds"
else
- echo "not ok $test_num $TEST_HDR_MSG # exit=$rc"
+ report_failure "$test_num $TEST_HDR_MSG # exit=$rc"
fi)
cd - >/dev/null
fi
diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh
index 0443beacf3621ae36cb12ffd57f696ddef3526b5..cc1b4190edacedadafd9b993a351e4cfbf17ccd5 100755
--- a/tools/testing/selftests/run_kselftest.sh
+++ b/tools/testing/selftests/run_kselftest.sh
@@ -33,6 +33,7 @@ Usage: $0 [OPTIONS]
-c | --collection COLLECTION Run all tests from COLLECTION
-l | --list List the available collection:test entries
-d | --dry-run Don't actually run any tests
+ -f | --no-error-on-fail Don't exit with an error just because tests failed
-n | --netns Run each test in namespace
-h | --help Show this usage info
-o | --override-timeout Number of seconds after which we timeout
@@ -44,6 +45,7 @@ COLLECTIONS=""
TESTS=""
dryrun=""
kselftest_override_timeout=""
+ERROR_ON_FAIL=true
while true; do
case "$1" in
-s | --summary)
@@ -65,6 +67,9 @@ while true; do
-d | --dry-run)
dryrun="echo"
shift ;;
+ -f | --no-error-on-fail)
+ ERROR_ON_FAIL=false
+ shift ;;
-n | --netns)
RUN_IN_NETNS=1
shift ;;
@@ -105,9 +110,18 @@ if [ -n "$TESTS" ]; then
available="$(echo "$valid" | sed -e 's/ /\n/g')"
fi
+kselftest_failures_file=$(mktemp --tmpdir kselftest-failures-XXXXXX)
+export kselftest_failures_file
+
collections=$(echo "$available" | cut -d: -f1 | sort | uniq)
for collection in $collections ; do
[ -w /dev/kmsg ] && echo "kselftest: Running tests in $collection" >> /dev/kmsg
tests=$(echo "$available" | grep "^$collection:" | cut -d: -f2)
($dryrun cd "$collection" && $dryrun run_many $tests)
done
+
+failures="$(cat "$kselftest_failures_file")"
+rm "$kselftest_failures_file"
+if "$ERROR_ON_FAIL" && [ "$failures" ]; then
+ exit 1
+fi
---
base-commit: 8f5ae30d69d7543eee0d70083daf4de8fe15d585
change-id: 20251007-b4-ksft-error-on-fail-0c2cb3246041
Best regards,
--
Brendan Jackman <jackmanb(a)google.com>
When the bpf ring buffer is full, new events can not be recorded util
the consumer consumes some events to free space. This may cause critical
events to be discarded, such as in fault diagnostic, where recent events
are more critical than older ones.
So add ovewrite mode for bpf ring buffer. In this mode, the new event
overwrites the oldest event when the buffer is full.
v2:
- remove libbpf changes (Andrii)
- update overwrite benchmark
v1:
https://lore.kernel.org/bpf/20250804022101.2171981-1-xukuohai@huaweicloud.c…
Xu Kuohai (3):
bpf: Add overwrite mode for bpf ring buffer
selftests/bpf: Add test for overwrite ring buffer
selftests/bpf/benchs: Add producer and overwrite bench for ring buffer
include/uapi/linux/bpf.h | 4 +
kernel/bpf/ringbuf.c | 159 +++++++++++++++---
tools/include/uapi/linux/bpf.h | 4 +
tools/testing/selftests/bpf/Makefile | 3 +-
tools/testing/selftests/bpf/bench.c | 2 +
.../selftests/bpf/benchs/bench_ringbufs.c | 95 ++++++++++-
.../bpf/benchs/run_bench_ringbufs.sh | 4 +
.../selftests/bpf/prog_tests/ringbuf.c | 74 ++++++++
.../selftests/bpf/progs/ringbuf_bench.c | 10 ++
.../bpf/progs/test_ringbuf_overwrite.c | 98 +++++++++++
10 files changed, 418 insertions(+), 35 deletions(-)
create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c
--
2.43.0
The jq command is used in vlan_bridge_binding.sh, if it is not supported,
the test will spam the following log.
# ./vlan_bridge_binding.sh: line 51: jq: command not found
# ./vlan_bridge_binding.sh: line 51: jq: command not found
# ./vlan_bridge_binding.sh: line 51: jq: command not found
# ./vlan_bridge_binding.sh: line 51: jq: command not found
# ./vlan_bridge_binding.sh: line 51: jq: command not found
# TEST: Test bridge_binding on->off when lower down [FAIL]
# Got operstate of , expected 0
The rtnetlink.sh has the same problem. It makes sense to check if jq is
installed before running these tests. After this patch, the
vlan_bridge_binding.sh skipped if jq is not supported:
# timeout set to 3600
# selftests: net: vlan_bridge_binding.sh
# TEST: jq not installed [SKIP]
Fixes: dca12e9ab760 ("selftests: net: Add a VLAN bridge binding selftest")
Fixes: 6a414fd77f61 ("selftests: rtnetlink: Add an address proto test")
Signed-off-by: Wang Liang <wangliang74(a)huawei.com>
Reviewed-by: Hangbin Liu <liuhangbin(a)gmail.com>
---
tools/testing/selftests/net/rtnetlink.sh | 2 ++
tools/testing/selftests/net/vlan_bridge_binding.sh | 2 ++
2 files changed, 4 insertions(+)
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index dbf77513f617..163a084d525d 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -1466,6 +1466,8 @@ usage: ${0##*/} OPTS
EOF
}
+require_command jq
+
#check for needed privileges
if [ "$(id -u)" -ne 0 ];then
end_test "SKIP: Need root privileges"
diff --git a/tools/testing/selftests/net/vlan_bridge_binding.sh b/tools/testing/selftests/net/vlan_bridge_binding.sh
index db481af9b6b3..e8c02c64e03a 100755
--- a/tools/testing/selftests/net/vlan_bridge_binding.sh
+++ b/tools/testing/selftests/net/vlan_bridge_binding.sh
@@ -249,6 +249,8 @@ test_binding_toggle_off_when_upper_down()
do_test_binding_off : "on->off when upper down"
}
+require_command jq
+
trap defer_scopes_cleanup EXIT
setup_prepare
tests_run
--
2.34.1
From: Wilfred Mallawa <wilfred.mallawa(a)wdc.com>
During a handshake, an endpoint may specify a maximum record size limit.
Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
maximum record size. Meaning that, the outgoing records from the kernel
can exceed a lower size negotiated during the handshake. In such a case,
the TLS endpoint must send a fatal "record_overflow" alert [1], and
thus the record is discarded.
Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
support. For these devices, supporting TLS record size negotiation is
necessary because the maximum TLS record size supported by the controller
is less than the default 16KB currently used by the kernel.
Currently, there is no way to inform the kernel of such a limit. This patch
adds support to a new setsockopt() option `TLS_TX_MAX_PAYLOAD_LEN` that
allows for setting the maximum plaintext fragment size. Once set, outgoing
records are no larger than the size specified. This option can be used to
specify the record size limit.
[1] https://www.rfc-editor.org/rfc/rfc8449
Signed-off-by: Wilfred Mallawa <wilfred.mallawa(a)wdc.com>
---
Changes V4 -> V5
- Change the socket option to TLS_TX_MAX_PAYLOAD_LEN, such that we can
limit the payload length in a generic way, as pposed to strictly
specifying record size limit. No functional changes other than
removing TLS 1.3 content byte length checks for this argument.
- Lock the socket when calling do_tls_setsockopt_tx_payload_len()
V4: https://lore.kernel.org/netdev/20250923053207.113938-1-wilfred.opensource@g…
---
Documentation/networking/tls.rst | 11 ++++++
include/net/tls.h | 3 ++
include/uapi/linux/tls.h | 2 ++
net/tls/tls_device.c | 2 +-
net/tls/tls_main.c | 62 ++++++++++++++++++++++++++++++++
net/tls/tls_sw.c | 2 +-
6 files changed, 80 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index 36cc7afc2527..dabab17ab84a 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -280,6 +280,17 @@ If the record decrypted turns out to had been padded or is not a data
record it will be decrypted again into a kernel buffer without zero copy.
Such events are counted in the ``TlsDecryptRetry`` statistic.
+TLS_TX_MAX_PAYLOAD_LEN
+~~~~~~~~~~~~~~~~~~~~~~
+
+Sets the maximum size for the plaintext of a protected record.
+
+When this option is set, the kernel enforces this limit on all transmitted TLS
+records, ensuring no plaintext fragment exceeds the specified size. This can be
+used to specify the TLS Record Size Limit [1].
+
+[1] https://datatracker.ietf.org/doc/html/rfc8449
+
Statistics
==========
diff --git a/include/net/tls.h b/include/net/tls.h
index 857340338b69..f2af113728aa 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -53,6 +53,8 @@ struct tls_rec;
/* Maximum data size carried in a TLS record */
#define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14)
+/* Minimum record size limit as per RFC8449 */
+#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6)
#define TLS_HEADER_SIZE 5
#define TLS_NONCE_OFFSET TLS_HEADER_SIZE
@@ -226,6 +228,7 @@ struct tls_context {
u8 rx_conf:3;
u8 zerocopy_sendfile:1;
u8 rx_no_pad:1;
+ u16 tx_max_payload_len;
int (*push_pending_record)(struct sock *sk, int flags);
void (*sk_write_space)(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index b66a800389cc..b8b9c42f848c 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -41,6 +41,7 @@
#define TLS_RX 2 /* Set receive parameters */
#define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */
#define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */
+#define TLS_TX_MAX_PAYLOAD_LEN 5 /* Maximum plaintext size */
/* Supported versions */
#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
@@ -194,6 +195,7 @@ enum {
TLS_INFO_RXCONF,
TLS_INFO_ZC_RO_TX,
TLS_INFO_RX_NO_PAD,
+ TLS_INFO_TX_MAX_PAYLOAD_LEN,
__TLS_INFO_MAX,
};
#define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index a64ae15b1a60..c6289c73cffc 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -461,7 +461,7 @@ static int tls_push_data(struct sock *sk,
/* TLS_HEADER_SIZE is not counted as part of the TLS record, and
* we need to leave room for an authentication tag.
*/
- max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
+ max_open_record_len = tls_ctx->tx_max_payload_len +
prot->prepend_size;
do {
rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index a3ccb3135e51..b481d1add14e 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -544,6 +544,28 @@ static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval,
return 0;
}
+static int do_tls_getsockopt_tx_payload_len(struct sock *sk, char __user *optval,
+ int __user *optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ u16 payload_len = ctx->tx_max_payload_len;
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len < sizeof(payload_len))
+ return -EINVAL;
+
+ if (put_user(sizeof(payload_len), optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, &payload_len, sizeof(payload_len)))
+ return -EFAULT;
+
+ return 0;
+}
+
static int do_tls_getsockopt(struct sock *sk, int optname,
char __user *optval, int __user *optlen)
{
@@ -563,6 +585,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_getsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_MAX_PAYLOAD_LEN:
+ rc = do_tls_getsockopt_tx_payload_len(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -812,6 +837,30 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
return rc;
}
+static int do_tls_setsockopt_tx_payload_len(struct sock *sk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx);
+ u16 value;
+
+ if (sw_ctx->open_rec)
+ return -EBUSY;
+
+ if (sockptr_is_null(optval) || optlen != sizeof(value))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&value, optval, sizeof(value)))
+ return -EFAULT;
+
+ if (value < TLS_MIN_RECORD_SIZE_LIM || value > TLS_MAX_PAYLOAD_SIZE)
+ return -EINVAL;
+
+ ctx->tx_max_payload_len = value;
+
+ return 0;
+}
+
static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
unsigned int optlen)
{
@@ -833,6 +882,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_MAX_PAYLOAD_LEN:
+ lock_sock(sk);
+ rc = do_tls_setsockopt_tx_payload_len(sk, optval, optlen);
+ release_sock(sk);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -1022,6 +1076,7 @@ static int tls_init(struct sock *sk)
ctx->tx_conf = TLS_BASE;
ctx->rx_conf = TLS_BASE;
+ ctx->tx_max_payload_len = TLS_MAX_PAYLOAD_SIZE;
update_sk_prot(sk, ctx);
out:
write_unlock_bh(&sk->sk_callback_lock);
@@ -1111,6 +1166,12 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
goto nla_failure;
}
+ err = nla_put_u16(skb, TLS_INFO_TX_MAX_PAYLOAD_LEN,
+ ctx->tx_max_payload_len);
+
+ if (err)
+ goto nla_failure;
+
rcu_read_unlock();
nla_nest_end(skb, start);
return 0;
@@ -1132,6 +1193,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin)
nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */
nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */
+ nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_MAX_PAYLOAD_LEN */
0;
return size;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index daac9fd4be7e..e76ea38b712a 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1079,7 +1079,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
orig_size = msg_pl->sg.size;
full_record = false;
try_to_copy = msg_data_left(msg);
- record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
+ record_room = tls_ctx->tx_max_payload_len - msg_pl->sg.size;
if (try_to_copy >= record_room) {
try_to_copy = record_room;
full_record = true;
--
2.51.0
The generic vDSO provides a lot common functionality shared between
different architectures. SPARC is the last architecture not using it,
preventing some necessary code cleanup.
Make use of the generic infrastructure.
Follow-up to and replacement for Arnd's SPARC vDSO removal patches:
https://lore.kernel.org/lkml/20250707144726.4008707-1-arnd@kernel.org/
SPARC64 can not map .bss into userspace, so the vDSO datapages are
switched over to be allocated dynamically. This requires changes to the
s390 and random subsystem vDSO initialization as preparation.
The random subsystem changes in turn require some cleanup of the vDSO
headers to not end up as ugly #ifdef mess.
Tested on a Niagara T4 and QEMU.
This has a semantic conflict with my series "vdso: Reject absolute
relocations during build" [0]. The last patch of this series expects all
users of the generic vDSO library to use the vdsocheck tool.
This is not the case (yet) for SPARC64. I do have the patches for the
integration, the specifics will depend on which series is applied first.
Based on v6.18-rc1.
[0] https://lore.kernel.org/lkml/20250812-vdso-absolute-reloc-v4-0-61a8b615e5ec…
Signed-off-by: Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>
---
Changes in v4:
- Rebase on v6.18-rc1.
- Keep inclusion of asm/clocksource.h from linux/clocksource.h
- Reword description of "s390/time: Set up vDSO datapage later"
- Link to v3: https://lore.kernel.org/r/20250917-vdso-sparc64-generic-2-v3-0-3679b1bc8ee8…
Changes in v3:
- Allocate vDSO data pages dynamically (and lots of preparations for that)
- Drop clock_getres()
- Fix 32bit clock_gettime() syscall fallback
- Link to v2: https://lore.kernel.org/r/20250815-vdso-sparc64-generic-2-v2-0-b5ff80672347…
Changes in v2:
- Rebase on v6.17-rc1
- Drop RFC state
- Fix typo in commit message
- Drop duplicate 'select GENERIC_TIME_VSYSCALL'
- Merge "sparc64: time: Remove architecture-specific clocksource data" into the
main conversion patch. It violated the check in __clocksource_register_scale()
- Link to v1: https://lore.kernel.org/r/20250724-vdso-sparc64-generic-2-v1-0-e376a3bd24d1…
---
Arnd Bergmann (1):
clocksource: remove ARCH_CLOCKSOURCE_DATA
Thomas Weißschuh (34):
selftests: vDSO: vdso_test_correctness: Handle different tv_usec types
arm64: vDSO: getrandom: Explicitly include asm/alternative.h
arm64: vDSO: gettimeofday: Explicitly include vdso/clocksource.h
arm64: vDSO: compat_gettimeofday: Add explicit includes
ARM: vdso: gettimeofday: Add explicit includes
powerpc/vdso/gettimeofday: Explicitly include vdso/time32.h
powerpc/vdso: Explicitly include asm/cputable.h and asm/feature-fixups.h
LoongArch: vDSO: Explicitly include asm/vdso/vdso.h
MIPS: vdso: Add include guard to asm/vdso/vdso.h
MIPS: vdso: Explicitly include asm/vdso/vdso.h
random: vDSO: Add explicit includes
vdso/gettimeofday: Add explicit includes
vdso/helpers: Explicitly include vdso/processor.h
vdso/datapage: Remove inclusion of gettimeofday.h
vdso/datapage: Trim down unnecessary includes
random: vDSO: trim vDSO includes
random: vDSO: remove ifdeffery
random: vDSO: split out datapage update into helper functions
random: vDSO: only access vDSO datapage after random_init()
s390/time: Set up vDSO datapage later
vdso/datastore: Reduce scope of some variables in vvar_fault()
vdso/datastore: Drop inclusion of linux/mmap_lock.h
vdso/datastore: Map pages through struct page
vdso/datastore: Allocate data pages dynamically
sparc64: vdso: Link with -z noexecstack
sparc64: vdso: Remove obsolete "fake section table" reservation
sparc64: vdso: Replace code patching with runtime conditional
sparc64: vdso: Move hardware counter read into header
sparc64: vdso: Move syscall fallbacks into header
sparc64: vdso: Introduce vdso/processor.h
sparc64: vdso: Switch to the generic vDSO library
sparc64: vdso2c: Drop sym_vvar_start handling
sparc64: vdso2c: Remove symbol handling
sparc64: vdso: Implement clock_gettime64()
arch/arm/include/asm/vdso/gettimeofday.h | 2 +
arch/arm64/include/asm/vdso/compat_gettimeofday.h | 3 +
arch/arm64/include/asm/vdso/gettimeofday.h | 2 +
arch/arm64/kernel/vdso/vgetrandom.c | 2 +
arch/loongarch/kernel/process.c | 1 +
arch/loongarch/kernel/vdso.c | 1 +
arch/mips/include/asm/vdso/vdso.h | 5 +
arch/mips/kernel/vdso.c | 1 +
arch/powerpc/include/asm/vdso/gettimeofday.h | 1 +
arch/powerpc/include/asm/vdso/processor.h | 3 +
arch/s390/kernel/time.c | 4 +-
arch/sparc/Kconfig | 3 +-
arch/sparc/include/asm/clocksource.h | 9 -
arch/sparc/include/asm/processor.h | 3 +
arch/sparc/include/asm/processor_32.h | 2 -
arch/sparc/include/asm/processor_64.h | 25 --
arch/sparc/include/asm/vdso.h | 2 -
arch/sparc/include/asm/vdso/clocksource.h | 10 +
arch/sparc/include/asm/vdso/gettimeofday.h | 184 ++++++++++
arch/sparc/include/asm/vdso/processor.h | 41 +++
arch/sparc/include/asm/vdso/vsyscall.h | 10 +
arch/sparc/include/asm/vvar.h | 75 ----
arch/sparc/kernel/Makefile | 1 -
arch/sparc/kernel/time_64.c | 6 +-
arch/sparc/kernel/vdso.c | 69 ----
arch/sparc/vdso/Makefile | 8 +-
arch/sparc/vdso/vclock_gettime.c | 380 ++-------------------
arch/sparc/vdso/vdso-layout.lds.S | 26 +-
arch/sparc/vdso/vdso.lds.S | 2 -
arch/sparc/vdso/vdso2c.c | 24 --
arch/sparc/vdso/vdso2c.h | 45 +--
arch/sparc/vdso/vdso32/vdso32.lds.S | 4 +-
arch/sparc/vdso/vma.c | 274 +--------------
drivers/char/random.c | 71 ++--
include/linux/clocksource.h | 6 +-
include/linux/vdso_datastore.h | 6 +
include/vdso/datapage.h | 23 +-
include/vdso/helpers.h | 1 +
init/main.c | 2 +
kernel/time/Kconfig | 4 -
lib/vdso/datastore.c | 73 ++--
lib/vdso/getrandom.c | 3 +
lib/vdso/gettimeofday.c | 17 +
.../testing/selftests/vDSO/vdso_test_correctness.c | 8 +-
44 files changed, 448 insertions(+), 994 deletions(-)
---
base-commit: 28b1ac5ccd8d4900a8f53f0e6e84d517a7ccc71f
change-id: 20250722-vdso-sparc64-generic-2-25f2e058e92c
Best regards,
--
Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>
The MBM (Memory Bandwidth Monitoring) and MBA (Memory Bandwidth Allocation)
features are not enabled for AMD systems. The reason was lack of perf
counters to compare the resctrl test results.
Starting with the commit
25e56847821f ("perf/x86/amd/uncore: Add memory controller support"), AMD
now supports the UMC (Unified Memory Controller) perf events. These events
can be used to compare the test results.
This series adds the support to detect the UMC events and enable MBM/MBA
tests for AMD systems.
v3:
Note: Based the series on top of latest kselftests/master
1613e604df0cd359cf2a7fbd9be7a0bcfacfabd0 (tag: v6.10-rc1).
Also applied the patches from the series
https://lore.kernel.org/lkml/20240531131142.1716-1-ilpo.jarvinen@linux.inte…
Separated the fix patch.
Renamed the imc to just mc to make it generic.
Changed the search string "uncore_imc_" and "amd_umc_"
Changes related rebase to latest kselftest tree.
v2: Changes.
a. Rebased on top of tip/master (Apr 25, 2024)
b. Addressed Ilpo comments except the one about close call.
It seems more clear to keep READ and WRITE separate.
https://lore.kernel.org/lkml/8e4badb7-6cc5-61f1-e041-d902209a90d5@linux.int…
c. Used ksft_perror call when applicable.
d. Added vendor check for non contiguous CBM check.
v1: https://lore.kernel.org/lkml/cover.1708637563.git.babu.moger@amd.com/
Babu Moger (4):
selftests/resctrl: Rename variables and functions to generic names
selftests/resctrl: Pass sysfs controller name of the vendor
selftests/resctrl: Add support for MBM and MBA tests on AMD
selftests/resctrl: Enable MBA/MBA tests on AMD
tools/testing/selftests/resctrl/mba_test.c | 25 +-
tools/testing/selftests/resctrl/mbm_test.c | 23 +-
tools/testing/selftests/resctrl/resctrl.h | 2 +-
tools/testing/selftests/resctrl/resctrl_val.c | 305 ++++++++++--------
tools/testing/selftests/resctrl/resctrlfs.c | 2 +-
5 files changed, 191 insertions(+), 166 deletions(-)
--
2.34.1
Fix a memory leak in netpoll and introduce netconsole selftests that
expose the issue when running with kmemleak detection enabled.
This patchset includes a selftest for netpoll with multiple concurrent
users (netconsole + bonding), which simulates the scenario from test[1]
that originally demonstrated the issue allegedly fixed by commit
efa95b01da18 ("netpoll: fix use after free") - a commit that is now
being reverted.
Sending this to "net" branch because this is a fix, and the selftest
might help with the backports validation.
Link: https://lore.kernel.org/lkml/96b940137a50e5c387687bb4f57de8b0435a653f.14048… [1]
Signed-off-by: Breno Leitao <leitao(a)debian.org>
---
Changes in v7:
- Rebased on top of `net`
- Link to v6: https://lore.kernel.org/r/20251002-netconsole_torture-v6-0-543bf52f6b46@deb…
Changes in v6:
- Expand the tests even more and some small fixups
- Moved the test to bonding selftests
- Link to v5: https://lore.kernel.org/r/20250918-netconsole_torture-v5-0-77e25e0a4eb6@deb…
Changes in v5:
- Set CONFIG_BONDING=m in selftests/drivers/net/config.
- Link to v4: https://lore.kernel.org/r/20250917-netconsole_torture-v4-0-0a5b3b8f81ce@deb…
Changes in v4:
- Added an additional selftest to test multiple netpoll users in
parallel
- Link to v3: https://lore.kernel.org/r/20250905-netconsole_torture-v3-0-875c7febd316@deb…
Changes in v3:
- This patchset is a merge of the fix and the selftest together as
recommended by Jakub.
Changes in v2:
- Reuse the netconsole creation from lib_netcons.sh. Thus, refactoring
the create_dynamic_target() (Jakub)
- Move the "wait" to after all the messages has been sent.
- Link to v1: https://lore.kernel.org/r/20250902-netconsole_torture-v1-1-03c6066598e9@deb…
---
Breno Leitao (4):
net: netpoll: fix incorrect refcount handling causing incorrect cleanup
selftest: netcons: refactor target creation
selftest: netcons: create a torture test
selftest: netcons: add test for netconsole over bonded interfaces
net/core/netpoll.c | 7 +-
tools/testing/selftests/drivers/net/Makefile | 1 +
.../testing/selftests/drivers/net/bonding/Makefile | 2 +
tools/testing/selftests/drivers/net/bonding/config | 4 +
.../drivers/net/bonding/netcons_over_bonding.sh | 221 +++++++++++++++++++++
.../selftests/drivers/net/lib/sh/lib_netcons.sh | 188 ++++++++++++++++--
.../selftests/drivers/net/netcons_torture.sh | 127 ++++++++++++
7 files changed, 530 insertions(+), 20 deletions(-)
---
base-commit: 7ae421cf78bd795513ec3a7d7ef7ac9437693e23
change-id: 20250902-netconsole_torture-8fc23f0aca99
Best regards,
--
Breno Leitao <leitao(a)debian.org>
This has been tested on a Google Skylake platform.
One potential issue with this test is that it fails (that is, the
exploit succeeds) when using the conditional L1D flush, because the
gadget is injected into the hypercall path which doesn't appear to
include a flush. If this is unacceptable, we should discuss how to amend
the test so that it can be used to evaluate the conditional flush logic
as well. This would basically mean simulating some more complicated
gadget where the "attacker" has found another way to steer the host
kernel towards the target data, instead of just a simple hypercall.
The reason this limitation is tolerable to me is my ulterior motive,
i.e. because I am specifically interested in an end-to-end test for
Address Space Isolation [0], which is abstracted from these details of the
exploit.
Based on kvm/next.
[0] https://lore.kernel.org/all/20250924-b4-asi-page-alloc-v1-0-2d861768041f@go…
Signed-off-by: Brendan Jackman <jackmanb(a)google.com>
---
Alexandra Sandulescu (1):
KVM: x86: selftests: add an L1TF exploit test
Brendan Jackman (1):
selftests: fix installing nested TEST_GEN_MODS_DIR
tools/testing/selftests/kvm/Makefile.kvm | 7 +
tools/testing/selftests/kvm/x86/l1tf_test.c | 633 +++++++++++++++++++++
tools/testing/selftests/kvm/x86/l1tf_test.sh | 10 +
.../selftests/kvm/x86/test_modules/Makefile | 10 +
.../kvm/x86/test_modules/l1tf_test_helper.c | 92 +++
tools/testing/selftests/lib.mk | 2 +-
6 files changed, 753 insertions(+), 1 deletion(-)
---
base-commit: 6b36119b94d0b2bb8cea9d512017efafd461d6ac
change-id: 20251013-l1tf-test-1bee540cefb4
Best regards,
--
Brendan Jackman <jackmanb(a)google.com>
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
In case you're building your own rootfs using toolchain, please make sure you
pick following patch to ensure that vDSO compiled with lpad and shadow stack.
"arch/riscv: compile vdso with landing pad"
Branch where above patch can be picked
https://github.com/deepak0414/linux-riscv-cfi/tree/vdso_user_cfi_v6.12-rc1
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
vDSO related Opens (in the flux)
=================================
I am listing these opens for laying out plan and what to expect in future
patch sets. And of course for the sake of discussion.
Shadow stack and landing pad enabling in vDSO
----------------------------------------------
vDSO must have shadow stack and landing pad support compiled in for task
to have shadow stack and landing pad support. This patch series doesn't
enable that (yet). Enabling shadow stack support in vDSO should be
straight forward (intend to do that in next versions of patch set). Enabling
landing pad support in vDSO requires some collaboration with toolchain folks
to follow a single label scheme for all object binaries. This is necessary to
ensure that all indirect call-sites are setting correct label and target landing
pads are decorated with same label scheme.
How many vDSOs
---------------
Shadow stack instructions are carved out of zimop (may be operations) and if CPU
doesn't implement zimop, they're illegal instructions. Kernel could be running on
a CPU which may or may not implement zimop. And thus kernel will have to carry 2
different vDSOs and expose the appropriate one depending on whether CPU implements
zimop or not.
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (25):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 21 +
arch/riscv/Makefile | 5 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 95 ++++
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 1 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 27 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso/Makefile | 11 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 3 +
tools/testing/selftests/riscv/cfi/Makefile | 16 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/riscv_cfi_test.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
56 files changed, 2389 insertions(+), 30 deletions(-)
---
base-commit: a2a05801de77ca5122fc34e3eb84d6359ef70389
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug