This introduces signal->exec_bprm, which is used to
fix the case when at least one of the sibling threads
is traced, and therefore the trace process may dead-lock
in ptrace_attach, but de_thread will need to wait for the
tracer to continue execution.
The solution is to detect this situation and allow
ptrace_attach to continue by temporarily releasing the
cred_guard_mutex, while de_thread() is still waiting for
traced zombies to be eventually released by the tracer.
In the case of the thread group leader we only have to wait
for the thread to become a zombie, which may also need
co-operation from the tracer due to PTRACE_O_TRACEEXIT.
When a tracer wants to ptrace_attach a task that already
is in execve, we simply retry the ptrace_may_access
check while temporarily installing the new credentials
and dumpability which are about to be used after execve
completes. If the ptrace_attach happens on a thread that
is a sibling-thread of the thread doing execve, it is
sufficient to check against the old credentials, as this
thread will be waited for, before the new credentials are
installed.
Other threads die quickly since the cred_guard_mutex is
released, but a deadly signal is already pending. In case
the mutex_lock_killable misses the signal, the non-zero
current->signal->exec_bprm makes sure they release the
mutex immediately and return with -ERESTARTNOINTR.
This means there is no API change, unlike the previous
version of this patch which was discussed here:
https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.d…
See tools/testing/selftests/ptrace/vmaccess.c
for a test case that gets fixed by this change.
Note that since the test case was originally designed to
test the ptrace_attach returning an error in this situation,
the test expectation needed to be adjusted, to allow the
API to succeed at the first attempt.
Signed-off-by: Bernd Edlinger <bernd.edlinger(a)hotmail.de>
---
fs/exec.c | 69 ++++++++++++++++-------
fs/proc/base.c | 6 ++
include/linux/cred.h | 1 +
include/linux/sched/signal.h | 18 ++++++
kernel/cred.c | 28 +++++++--
kernel/ptrace.c | 32 +++++++++++
kernel/seccomp.c | 12 +++-
tools/testing/selftests/ptrace/vmaccess.c | 23 +++++---
8 files changed, 155 insertions(+), 34 deletions(-)
v10: Changes to previous version, make the PTRACE_ATTACH
retun -EAGAIN, instead of execve return -ERESTARTSYS.
Added some lessions learned to the description.
v11: Check old and new credentials in PTRACE_ATTACH again without
changing the API.
Note: I got actually one response from an automatic checker to the v11 patch,
https://lore.kernel.org/lkml/202107121344.wu68hEPF-lkp@intel.com/
which is complaining about:
>> kernel/ptrace.c:425:26: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct cred const *old_cred @@ got struct cred const [noderef] __rcu *real_cred @@
417 struct linux_binprm *bprm = task->signal->exec_bprm;
418 const struct cred *old_cred;
419 struct mm_struct *old_mm;
420
421 retval = down_write_killable(&task->signal->exec_update_lock);
422 if (retval)
423 goto unlock_creds;
424 task_lock(task);
> 425 old_cred = task->real_cred;
v12: Essentially identical to v11.
- Fixed a minor merge conflict in linux v5.17, and fixed the
above mentioned nit by adding __rcu to the declaration.
- re-tested the patch with all linux versions from v5.11 to v6.6
v10 was an alternative approach which did imply an API change.
But I would prefer to avoid such an API change.
The difficult part is getting the right dumpability flags assigned
before de_thread starts, hope you like this version.
If not, the v10 is of course also acceptable.
Thanks
Bernd.
diff --git a/fs/exec.c b/fs/exec.c
index 2f2b0acec4f0..902d3b230485 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1041,11 +1041,13 @@ static int exec_mmap(struct mm_struct *mm)
return 0;
}
-static int de_thread(struct task_struct *tsk)
+static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm)
{
struct signal_struct *sig = tsk->signal;
struct sighand_struct *oldsighand = tsk->sighand;
spinlock_t *lock = &oldsighand->siglock;
+ struct task_struct *t = tsk;
+ bool unsafe_execve_in_progress = false;
if (thread_group_empty(tsk))
goto no_thread_group;
@@ -1068,6 +1070,19 @@ static int de_thread(struct task_struct *tsk)
if (!thread_group_leader(tsk))
sig->notify_count--;
+ while_each_thread(tsk, t) {
+ if (unlikely(t->ptrace)
+ && (t != tsk->group_leader || !t->exit_state))
+ unsafe_execve_in_progress = true;
+ }
+
+ if (unlikely(unsafe_execve_in_progress)) {
+ spin_unlock_irq(lock);
+ sig->exec_bprm = bprm;
+ mutex_unlock(&sig->cred_guard_mutex);
+ spin_lock_irq(lock);
+ }
+
while (sig->notify_count) {
__set_current_state(TASK_KILLABLE);
spin_unlock_irq(lock);
@@ -1158,6 +1173,11 @@ static int de_thread(struct task_struct *tsk)
release_task(leader);
}
+ if (unlikely(unsafe_execve_in_progress)) {
+ mutex_lock(&sig->cred_guard_mutex);
+ sig->exec_bprm = NULL;
+ }
+
sig->group_exec_task = NULL;
sig->notify_count = 0;
@@ -1169,6 +1189,11 @@ static int de_thread(struct task_struct *tsk)
return 0;
killed:
+ if (unlikely(unsafe_execve_in_progress)) {
+ mutex_lock(&sig->cred_guard_mutex);
+ sig->exec_bprm = NULL;
+ }
+
/* protects against exit_notify() and __exit_signal() */
read_lock(&tasklist_lock);
sig->group_exec_task = NULL;
@@ -1253,6 +1278,24 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
return retval;
+ /* If the binary is not readable then enforce mm->dumpable=0 */
+ would_dump(bprm, bprm->file);
+ if (bprm->have_execfd)
+ would_dump(bprm, bprm->executable);
+
+ /*
+ * Figure out dumpability. Note that this checking only of current
+ * is wrong, but userspace depends on it. This should be testing
+ * bprm->secureexec instead.
+ */
+ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
+ is_dumpability_changed(current_cred(), bprm->cred) ||
+ !(uid_eq(current_euid(), current_uid()) &&
+ gid_eq(current_egid(), current_gid())))
+ set_dumpable(bprm->mm, suid_dumpable);
+ else
+ set_dumpable(bprm->mm, SUID_DUMP_USER);
+
/*
* Ensure all future errors are fatal.
*/
@@ -1261,7 +1304,7 @@ int begin_new_exec(struct linux_binprm * bprm)
/*
* Make this the only thread in the thread group.
*/
- retval = de_thread(me);
+ retval = de_thread(me, bprm);
if (retval)
goto out;
@@ -1284,11 +1327,6 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out;
- /* If the binary is not readable then enforce mm->dumpable=0 */
- would_dump(bprm, bprm->file);
- if (bprm->have_execfd)
- would_dump(bprm, bprm->executable);
-
/*
* Release all of the old mmap stuff
*/
@@ -1350,18 +1388,6 @@ int begin_new_exec(struct linux_binprm * bprm)
me->sas_ss_sp = me->sas_ss_size = 0;
- /*
- * Figure out dumpability. Note that this checking only of current
- * is wrong, but userspace depends on it. This should be testing
- * bprm->secureexec instead.
- */
- if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
- !(uid_eq(current_euid(), current_uid()) &&
- gid_eq(current_egid(), current_gid())))
- set_dumpable(current->mm, suid_dumpable);
- else
- set_dumpable(current->mm, SUID_DUMP_USER);
-
perf_event_exec();
__set_task_comm(me, kbasename(bprm->filename), true);
@@ -1480,6 +1506,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
return -ERESTARTNOINTR;
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ return -ERESTARTNOINTR;
+ }
+
bprm->cred = prepare_exec_creds();
if (likely(bprm->cred))
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..0da9adfadb48 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2788,6 +2788,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ rv = -ERESTARTNOINTR;
+ goto out_free;
+ }
+
rv = security_setprocattr(PROC_I(inode)->op.lsm,
file->f_path.dentry->d_name.name, page,
count);
diff --git a/include/linux/cred.h b/include/linux/cred.h
index f923528d5cc4..b01e309f5686 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -159,6 +159,7 @@ extern const struct cred *get_task_cred(struct task_struct *);
extern struct cred *cred_alloc_blank(void);
extern struct cred *prepare_creds(void);
extern struct cred *prepare_exec_creds(void);
+extern bool is_dumpability_changed(const struct cred *, const struct cred *);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern const struct cred *override_creds(const struct cred *);
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..14df7073a0a8 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -234,9 +234,27 @@ struct signal_struct {
struct mm_struct *oom_mm; /* recorded mm when the thread group got
* killed by the oom killer */
+ struct linux_binprm *exec_bprm; /* Used to check ptrace_may_access
+ * against new credentials while
+ * de_thread is waiting for other
+ * traced threads to terminate.
+ * Set while de_thread is executing.
+ * The cred_guard_mutex is released
+ * after de_thread() has called
+ * zap_other_threads(), therefore
+ * a fatal signal is guaranteed to be
+ * already pending in the unlikely
+ * event, that
+ * current->signal->exec_bprm happens
+ * to be non-zero after the
+ * cred_guard_mutex was acquired.
+ */
+
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
* (notably. ptrace)
+ * Held while execve runs, except when
+ * a sibling thread is being traced.
* Deprecated do not use in new code.
* Use exec_update_lock instead.
*/
diff --git a/kernel/cred.c b/kernel/cred.c
index 98cb4eca23fb..586cb6c7cf6b 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -433,6 +433,28 @@ static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
return false;
}
+/**
+ * is_dumpability_changed - Will changing creds from old to new
+ * affect the dumpability in commit_creds?
+ *
+ * Return: false - dumpability will not be changed in commit_creds.
+ * Return: true - dumpability will be changed to non-dumpable.
+ *
+ * @old: The old credentials
+ * @new: The new credentials
+ */
+bool is_dumpability_changed(const struct cred *old, const struct cred *new)
+{
+ if (!uid_eq(old->euid, new->euid) ||
+ !gid_eq(old->egid, new->egid) ||
+ !uid_eq(old->fsuid, new->fsuid) ||
+ !gid_eq(old->fsgid, new->fsgid) ||
+ !cred_cap_issubset(old, new))
+ return true;
+
+ return false;
+}
+
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
@@ -467,11 +489,7 @@ int commit_creds(struct cred *new)
get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */
- if (!uid_eq(old->euid, new->euid) ||
- !gid_eq(old->egid, new->egid) ||
- !uid_eq(old->fsuid, new->fsuid) ||
- !gid_eq(old->fsgid, new->fsgid) ||
- !cred_cap_issubset(old, new)) {
+ if (is_dumpability_changed(old, new)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 443057bee87c..eb1c450bb7d7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -20,6 +20,7 @@
#include <linux/pagemap.h>
#include <linux/ptrace.h>
#include <linux/security.h>
+#include <linux/binfmts.h>
#include <linux/signal.h>
#include <linux/uio.h>
#include <linux/audit.h>
@@ -435,6 +436,28 @@ static int ptrace_attach(struct task_struct *task, long request,
if (retval)
goto unlock_creds;
+ if (unlikely(task->in_execve)) {
+ struct linux_binprm *bprm = task->signal->exec_bprm;
+ const struct cred __rcu *old_cred;
+ struct mm_struct *old_mm;
+
+ retval = down_write_killable(&task->signal->exec_update_lock);
+ if (retval)
+ goto unlock_creds;
+ task_lock(task);
+ old_cred = task->real_cred;
+ old_mm = task->mm;
+ rcu_assign_pointer(task->real_cred, bprm->cred);
+ task->mm = bprm->mm;
+ retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+ rcu_assign_pointer(task->real_cred, old_cred);
+ task->mm = old_mm;
+ task_unlock(task);
+ up_write(&task->signal->exec_update_lock);
+ if (retval)
+ goto unlock_creds;
+ }
+
write_lock_irq(&tasklist_lock);
retval = -EPERM;
if (unlikely(task->exit_state))
@@ -508,6 +531,14 @@ static int ptrace_traceme(void)
{
int ret = -EPERM;
+ if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
+ return -ERESTARTNOINTR;
+
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ return -ERESTARTNOINTR;
+ }
+
write_lock_irq(&tasklist_lock);
/* Are we already being traced? */
if (!current->ptrace) {
@@ -523,6 +554,7 @@ static int ptrace_traceme(void)
}
}
write_unlock_irq(&tasklist_lock);
+ mutex_unlock(¤t->signal->cred_guard_mutex);
return ret;
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 255999ba9190..b29bbfa0b044 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1955,9 +1955,15 @@ static long seccomp_set_mode_filter(unsigned int flags,
* Make sure we cannot change seccomp or nnp state via TSYNC
* while another thread is in the middle of calling exec.
*/
- if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
- mutex_lock_killable(¤t->signal->cred_guard_mutex))
- goto out_put_fd;
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+ if (mutex_lock_killable(¤t->signal->cred_guard_mutex))
+ goto out_put_fd;
+
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ goto out_put_fd;
+ }
+ }
spin_lock_irq(¤t->sighand->siglock);
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
index 4db327b44586..3b7d81fb99bb 100644
--- a/tools/testing/selftests/ptrace/vmaccess.c
+++ b/tools/testing/selftests/ptrace/vmaccess.c
@@ -39,8 +39,15 @@ TEST(vmaccess)
f = open(mm, O_RDONLY);
ASSERT_GE(f, 0);
close(f);
- f = kill(pid, SIGCONT);
- ASSERT_EQ(f, 0);
+ f = waitpid(-1, NULL, 0);
+ ASSERT_NE(f, -1);
+ ASSERT_NE(f, 0);
+ ASSERT_NE(f, pid);
+ f = waitpid(-1, NULL, 0);
+ ASSERT_EQ(f, pid);
+ f = waitpid(-1, NULL, 0);
+ ASSERT_EQ(f, -1);
+ ASSERT_EQ(errno, ECHILD);
}
TEST(attach)
@@ -57,22 +64,24 @@ TEST(attach)
sleep(1);
k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
- ASSERT_EQ(errno, EAGAIN);
- ASSERT_EQ(k, -1);
+ ASSERT_EQ(k, 0);
k = waitpid(-1, &s, WNOHANG);
ASSERT_NE(k, -1);
ASSERT_NE(k, 0);
ASSERT_NE(k, pid);
ASSERT_EQ(WIFEXITED(s), 1);
ASSERT_EQ(WEXITSTATUS(s), 0);
- sleep(1);
- k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
+ k = waitpid(-1, &s, 0);
+ ASSERT_EQ(k, pid);
+ ASSERT_EQ(WIFSTOPPED(s), 1);
+ ASSERT_EQ(WSTOPSIG(s), SIGTRAP);
+ k = ptrace(PTRACE_CONT, pid, 0L, 0L);
ASSERT_EQ(k, 0);
k = waitpid(-1, &s, 0);
ASSERT_EQ(k, pid);
ASSERT_EQ(WIFSTOPPED(s), 1);
ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
- k = ptrace(PTRACE_DETACH, pid, 0L, 0L);
+ k = ptrace(PTRACE_CONT, pid, 0L, 0L);
ASSERT_EQ(k, 0);
k = waitpid(-1, &s, 0);
ASSERT_EQ(k, pid);
--
2.39.2
[Joerg, can you put this and vtd in linux-next please. The vtd series is still
good at v3 thanks]
Currently each of the iommu page table formats duplicates all of the logic
to maintain the page table and perform map/unmap/etc operations. There are
several different versions of the algorithms between all the different
formats. The io-pgtable system provides an interface to help isolate the
page table code from the iommu driver, but doesn't provide tools to
implement the common algorithms.
This makes it very hard to improve the state of the pagetable code under
the iommu domains as any proposed improvement needs to alter a large
number of different driver code paths. Combined with a lack of software
based testing this makes improvement in this area very hard.
iommufd wants several new page table operations:
- More efficient map/unmap operations, using iommufd's batching logic
- unmap that returns the physical addresses into a batch as it progresses
- cut that allows splitting areas so large pages can have holes
poked in them dynamically (ie guestmemfd hitless shared/private
transitions)
- More agressive freeing of table memory to avoid waste
- Fragmenting large pages so that dirty tracking can be more granular
- Reassembling large pages so that VMs can run at full IO performance
in migration/dirty tracking error flows
- KHO integration for kernel live upgrade
Together these are algorithmically complex enough to be a very significant
task to go and implement in all the page table formats we support. Just
the "server" focused drivers use almost all the formats (ARMv8 S1&S2 / x86
PAE / AMDv1 / VT-d SS / RISCV)
Instead of doing the duplicated work, this series takes the first step to
consolidate the algorithms into one places. In spirit it is similar to the
work Christoph did a few years back to pull the redundant get_user_pages()
implementations out of the arch code into core MM. This unlocked a great
deal of improvement in that space in the following years. I would like to
see the same benefit in iommu as well.
My first RFC showed a bigger picture with all most all formats and more
algorithms. This series reorganizes that to be narrowly focused on just
enough to convert the AMD driver to use the new mechanism.
kunit tests are provided that allow good testing of the algorithms and all
formats on x86, nothing is arch specific.
AMD is one of the simpler options as the HW is quite uniform with few
different options/bugs while still requiring the complicated contiguous
pages support. The HW also has a very simple range based invalidation
approach that is easy to implement.
The AMD v1 and AMD v2 page table formats are implemented bit for bit
identical to the current code, tested using a compare kunit test that
checks against the io-pgtable version (on github, see below).
Updating the AMD driver to replace the io-pgtable layer with the new stuff
is fairly straightforward now. The layering is fixed up in the new version
so that all the invalidation goes through function pointers.
Several small fixing patches have come out of this as I've been fixing the
problems that the test suite uncovers in the current code, and
implementing the fixed version in iommupt.
On performance, there is a quite wide variety of implementation designs
across all the drivers. Looking at some key performance across
the main formats:
iommu_map():
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 53,66 , 51,63 , 19.19 (AMDV1)
256*2^12, 386,1909 , 367,1795 , 79.79
256*2^21, 362,1633 , 355,1556 , 77.77
2^12, 56,62 , 52,59 , 11.11 (AMDv2)
256*2^12, 405,1355 , 357,1292 , 72.72
256*2^21, 393,1160 , 358,1114 , 67.67
2^12, 55,65 , 53,62 , 14.14 (VT-d second stage)
256*2^12, 391,518 , 332,512 , 35.35
256*2^21, 383,635 , 336,624 , 46.46
2^12, 57,65 , 55,63 , 12.12 (ARM 64 bit)
256*2^12, 380,389 , 361,369 , 2.02
256*2^21, 358,419 , 345,400 , 13.13
iommu_unmap():
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 69,88 , 65,85 , 23.23 (AMDv1)
256*2^12, 353,6498 , 331,6029 , 94.94
256*2^21, 373,6014 , 360,5706 , 93.93
2^12, 71,72 , 66,69 , 4.04 (AMDv2)
256*2^12, 228,891 , 206,871 , 76.76
256*2^21, 254,721 , 245,711 , 65.65
2^12, 69,87 , 65,82 , 20.20 (VT-d second stage)
256*2^12, 210,321 , 200,315 , 36.36
256*2^21, 255,349 , 238,342 , 30.30
2^12, 72,77 , 68,74 , 8.08 (ARM 64 bit)
256*2^12, 521,357 , 447,346 , -29.29
256*2^21, 489,358 , 433,345 , -25.25
* Above numbers include additional patches to remove the iommu_pgsize()
overheads. gcc 13.3.0, i7-12700
This version provides fairly consistent performance across formats. ARM
unmap performance is quite different because this version supports
contiguous pages and uses a very different algorithm for unmapping. Though
why it is so worse compared to AMDv1 I haven't figured out yet.
The per-format commits include a more detailed chart.
There is a second branch:
https://github.com/jgunthorpe/linux/commits/iommu_pt_all
Containing supporting work and future steps:
- ARM short descriptor (32 bit), ARM long descriptor (64 bit) formats
- RISCV format and RISCV conversion
https://github.com/jgunthorpe/linux/commits/iommu_pt_riscv
- Support for a DMA incoherent HW page table walker
- VT-d second stage format and VT-d conversion
https://github.com/jgunthorpe/linux/commits/iommu_pt_vtd
- DART v1 & v2 format
- Draft of a iommufd 'cut' operation to break down huge pages
- A compare test that checks the iommupt formats against the iopgtable
interface, including updating AMD to have a working iopgtable and patches
to make VT-d have an iopgtable for testing.
- A performance test to micro-benchmark map and unmap against iogptable
My strategy is to go one by one for the drivers:
- AMD driver conversion
- RISCV page table and driver
- Intel VT-d driver and VTDSS page table
- Flushing improvements for RISCV
- ARM SMMUv3
And concurrently work on the algorithm side:
- debugfs content dump, like VT-d has
- Cut support
- Increase/Decrease page size support
- map/unmap batching
- KHO
As we make more algorithm improvements the value to convert the drivers
increases.
This is on github: https://github.com/jgunthorpe/linux/commits/iommu_pt
v8:
- Remove unused to_amdv1pt/common_to_amdv1pt/to_x86_64_pt/common_to_x86_64_pt
- Fix 32 bit udiv compile failure in the kunit
v7: https://patch.msgid.link/r/0-v7-ab019a8791e2+175b8-iommu_pt_jgg@nvidia.com
- Rebase to v6.18-rc2
- Improve comments and documentation
- Add a few missed __sme_sets() for AMD CC
- Rename pt_iommu_flush_ops -> pt_iommu_driver_ops
VT-D -> VT-d
pt_clear_entry -> pt_clear_entries
pt_entry_write_is_dirty -> pt_entry_is_write_dirty
pt_entry_set_write_clean -> pt_entry_make_write_clean
- Tidy some of the map flow into a new function do_map()
- Fix ffz64()
v6: https://patch.msgid.link/r/0-v6-0fb54a1d9850+36b-iommu_pt_jgg@nvidia.com
- Improve comments and documentation
- Rename pt_entry_oa_full -> pt_entry_oa_exact
pt_has_system_page -> pt_has_system_page_size
pt_max_output_address_lg2 -> pt_max_oa_lg2
log2_f*() -> vaf* / oaf* / f*_t
pt_item_fully_covered -> pt_entry_fully_covered
- Fix missed constant propogation causing division
- Consolidate debugging checks to pt_check_install_leaf_args()
- Change collect->ignore_mapped to check_mapped
- Shuffle some hunks around to more appropriate patches
- Two new mini kunit tests
v5: https://patch.msgid.link/r/0-v5-116c4948af3d+68091-iommu_pt_jgg@nvidia.com
- Text grammar updates and kdoc fixes
v4: https://patch.msgid.link/r/0-v4-0d6a6726a372+18959-iommu_pt_jgg@nvidia.com
- Rebase on v6.16-rc3
- Integrate the HATS/HATDis changes
- Remove 'default n' from kconfig
- Remove unused 'PT_FIXED_TOP_LEVEL'
- Improve comments and documentation
- Fix some compile warnings from kbuild robots
v3: https://patch.msgid.link/r/0-v3-a93aab628dbc+521-iommu_pt_jgg@nvidia.com
- Rebase on v6.16-rc2
- s/PT_ENTRY_WORD_SIZE/PT_ITEM_WORD_SIZE/s to follow the language better
- Comment and documentation updates
- Add PT_TOP_PHYS_MASK to help manage alignment restrictions on the top
pointer
- Add missed force_aperture = true
- Make pt_iommu_deinit() take care of the not-yet-inited error case
internally as AMD/RISCV/VTD all shared this logic
- Change gather_range() into gather_range_pages() so it also deals with
the page list. This makes the following cache flushing series simpler
- Fix missed update of unmap->unmapped in some error cases
- Change clear_contig() to order the gather more logically
- Remove goto from the error handling in __map_range_leaf()
- s/log2_/oalog2_/ in places where the argument is an oaddr_t
- Pass the pts to pt_table_install64/32()
- Do not use SIGN_EXTEND for the AMDv2 page table because of Vasant's
information on how PASID 0 works.
v2: https://patch.msgid.link/r/0-v2-5c26bde5c22d+58b-iommu_pt_jgg@nvidia.com
- AMD driver only, many code changes
RFC: https://lore.kernel.org/all/0-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com/
Cc: Michael Roth <michael.roth(a)amd.com>
Cc: Alexey Kardashevskiy <aik(a)amd.com>
Cc: Pasha Tatashin <pasha.tatashin(a)soleen.com>
Cc: James Gowans <jgowans(a)amazon.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
Alejandro Jimenez (1):
iommu/amd: Use the generic iommu page table
Jason Gunthorpe (14):
genpt: Generic Page Table base API
genpt: Add Documentation/ files
iommupt: Add the basic structure of the iommu implementation
iommupt: Add the AMD IOMMU v1 page table format
iommupt: Add iova_to_phys op
iommupt: Add unmap_pages op
iommupt: Add map_pages op
iommupt: Add read_and_clear_dirty op
iommupt: Add a kunit test for Generic Page Table
iommupt: Add a mock pagetable format for iommufd selftest to use
iommufd: Change the selftest to use iommupt instead of xarray
iommupt: Add the x86 64 bit page table format
iommu/amd: Remove AMD io_pgtable support
iommupt: Add a kunit test for the IOMMU implementation
.clang-format | 1 +
Documentation/driver-api/generic_pt.rst | 142 ++
Documentation/driver-api/index.rst | 1 +
drivers/iommu/Kconfig | 2 +
drivers/iommu/Makefile | 1 +
drivers/iommu/amd/Kconfig | 5 +-
drivers/iommu/amd/Makefile | 2 +-
drivers/iommu/amd/amd_iommu.h | 1 -
drivers/iommu/amd/amd_iommu_types.h | 110 +-
drivers/iommu/amd/io_pgtable.c | 577 --------
drivers/iommu/amd/io_pgtable_v2.c | 370 ------
drivers/iommu/amd/iommu.c | 538 ++++----
drivers/iommu/generic_pt/.kunitconfig | 13 +
drivers/iommu/generic_pt/Kconfig | 68 +
drivers/iommu/generic_pt/fmt/Makefile | 26 +
drivers/iommu/generic_pt/fmt/amdv1.h | 411 ++++++
drivers/iommu/generic_pt/fmt/defs_amdv1.h | 21 +
drivers/iommu/generic_pt/fmt/defs_x86_64.h | 21 +
drivers/iommu/generic_pt/fmt/iommu_amdv1.c | 15 +
drivers/iommu/generic_pt/fmt/iommu_mock.c | 10 +
drivers/iommu/generic_pt/fmt/iommu_template.h | 48 +
drivers/iommu/generic_pt/fmt/iommu_x86_64.c | 11 +
drivers/iommu/generic_pt/fmt/x86_64.h | 255 ++++
drivers/iommu/generic_pt/iommu_pt.h | 1162 +++++++++++++++++
drivers/iommu/generic_pt/kunit_generic_pt.h | 713 ++++++++++
drivers/iommu/generic_pt/kunit_iommu.h | 183 +++
drivers/iommu/generic_pt/kunit_iommu_pt.h | 487 +++++++
drivers/iommu/generic_pt/pt_common.h | 358 +++++
drivers/iommu/generic_pt/pt_defs.h | 329 +++++
drivers/iommu/generic_pt/pt_fmt_defaults.h | 233 ++++
drivers/iommu/generic_pt/pt_iter.h | 636 +++++++++
drivers/iommu/generic_pt/pt_log2.h | 122 ++
drivers/iommu/io-pgtable.c | 4 -
drivers/iommu/iommufd/Kconfig | 1 +
drivers/iommu/iommufd/iommufd_test.h | 11 +-
drivers/iommu/iommufd/selftest.c | 438 +++----
include/linux/generic_pt/common.h | 167 +++
include/linux/generic_pt/iommu.h | 271 ++++
include/linux/io-pgtable.h | 2 -
include/linux/irqchip/riscv-imsic.h | 3 +-
tools/testing/selftests/iommu/iommufd.c | 60 +-
tools/testing/selftests/iommu/iommufd_utils.h | 12 +
42 files changed, 6229 insertions(+), 1612 deletions(-)
create mode 100644 Documentation/driver-api/generic_pt.rst
delete mode 100644 drivers/iommu/amd/io_pgtable.c
delete mode 100644 drivers/iommu/amd/io_pgtable_v2.c
create mode 100644 drivers/iommu/generic_pt/.kunitconfig
create mode 100644 drivers/iommu/generic_pt/Kconfig
create mode 100644 drivers/iommu/generic_pt/fmt/Makefile
create mode 100644 drivers/iommu/generic_pt/fmt/amdv1.h
create mode 100644 drivers/iommu/generic_pt/fmt/defs_amdv1.h
create mode 100644 drivers/iommu/generic_pt/fmt/defs_x86_64.h
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_amdv1.c
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_mock.c
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_template.h
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_x86_64.c
create mode 100644 drivers/iommu/generic_pt/fmt/x86_64.h
create mode 100644 drivers/iommu/generic_pt/iommu_pt.h
create mode 100644 drivers/iommu/generic_pt/kunit_generic_pt.h
create mode 100644 drivers/iommu/generic_pt/kunit_iommu.h
create mode 100644 drivers/iommu/generic_pt/kunit_iommu_pt.h
create mode 100644 drivers/iommu/generic_pt/pt_common.h
create mode 100644 drivers/iommu/generic_pt/pt_defs.h
create mode 100644 drivers/iommu/generic_pt/pt_fmt_defaults.h
create mode 100644 drivers/iommu/generic_pt/pt_iter.h
create mode 100644 drivers/iommu/generic_pt/pt_log2.h
create mode 100644 include/linux/generic_pt/common.h
create mode 100644 include/linux/generic_pt/iommu.h
base-commit: 8440410283bb5533b676574211f31f030a18011b
--
2.43.0
'available_events' is actually not required by
'test.d/event/toplevel-enable.tc' and its Existence has been tested in
'test.d/00basic/basic4.tc'.
So the require of 'available_events' can be dropped and then we can add
'instance' flag to test 'test.d/event/toplevel-enable.tc' for instance.
Test result show as below:
# ./ftracetest test.d/event/toplevel-enable.tc
=== Ftrace unit tests ===
[1] event tracing - enable/disable with top level files [PASS]
[2] (instance) event tracing - enable/disable with top level files [PASS]
# of passed: 2
# of failed: 0
# of unresolved: 0
# of untested: 0
# of unsupported: 0
# of xfailed: 0
# of undefined(test bug): 0
Signed-off-by: Zheng Yejian <zhengyejian1(a)huawei.com>
---
tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc b/tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc
index 93c10ea42a68..8b8e1aea985b 100644
--- a/tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc
+++ b/tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc
@@ -1,7 +1,8 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
# description: event tracing - enable/disable with top level files
-# requires: available_events set_event events/enable
+# requires: set_event events/enable
+# flags: instance
do_reset() {
echo > set_event
--
2.25.1
At this point I think everyone in the on the kernel side is happy with
this but there were some questions from the glibc side about the value
of controlling the shadow stack placement and size, especially with the
current inability to reuse the shadow stack for an exited thread. With
support for reuse it would be possible to have a cache of shadow stacks
as is currently supported for the normal stack.
Since the discussion petered out I'm resending this in order to give
people something work with while prototyping. It should be possible to
prototype any potential kernel features to help build out shadow stack
support in userspace by enabling shadow stack writes, as suggested by
Rick Edgecombe this may end up being required anyway for supporting more
exotic scenarios. On all current architectures with the feature writes
to shadow stack require specific instructions so there are still
security benefits even with writes enabled.
I did send a change implementing a feature writing a token on thread
exit to allow reuse:
https://lore.kernel.org/r/20250921-arm64-gcs-exit-token-v1-0-45cf64e648d5@k…
but wasn't planning to refresh it without some indication from the
userspace side that that'd be useful.
Non-process cover letter:
The kernel has added support for shadow stacks, currently x86 only using
their CET feature but both arm64 and RISC-V have equivalent features
(GCS and Zicfiss respectively), I am actively working on GCS[1]. With
shadow stacks the hardware maintains an additional stack containing only
the return addresses for branch instructions which is not generally
writeable by userspace and ensures that any returns are to the recorded
addresses. This provides some protection against ROP attacks and making
it easier to collect call stacks. These shadow stacks are allocated in
the address space of the userspace process.
Our API for shadow stacks does not currently offer userspace any
flexiblity for managing the allocation of shadow stacks for newly
created threads, instead the kernel allocates a new shadow stack with
the same size as the normal stack whenever a thread is created with the
feature enabled. The stacks allocated in this way are freed by the
kernel when the thread exits or shadow stacks are disabled for the
thread. This lack of flexibility and control isn't ideal, in the vast
majority of cases the shadow stack will be over allocated and the
implicit allocation and deallocation is not consistent with other
interfaces. As far as I can tell the interface is done in this manner
mainly because the shadow stack patches were in development since before
clone3() was implemented.
Since clone3() is readily extensible let's add support for specifying a
shadow stack when creating a new thread or process, keeping the current
implicit allocation behaviour if one is not specified either with
clone3() or through the use of clone(). The user must provide a shadow
stack pointer, this must point to memory mapped for use as a shadow
stackby map_shadow_stack() with an architecture specified shadow stack
token at the top of the stack.
Yuri Khrustalev has raised questions from the libc side regarding
discoverability of extended clone3() structure sizes[2], this seems like
a general issue with clone3(). There was a suggestion to add a hwcap on
arm64 which isn't ideal but is doable there, though architecture
specific mechanisms would also be needed for x86 (and RISC-V if it's
support gets merged before this does). The idea has, however, had
strong pushback from the architecture maintainers and it is possible to
detect support for this in clone3() by attempting a call with a
misaligned shadow stack pointer specified so no hwcap has been added.
[1] https://lore.kernel.org/linux-arm-kernel/20241001-arm64-gcs-v13-0-222b78d87…
[2] https://lore.kernel.org/r/aCs65ccRQtJBnZ_5@arm.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v23:
- Rebase onto v6.19-rc1.
- Link to v22: https://lore.kernel.org/r/20251015-clone3-shadow-stack-v22-0-a8c8da011427@k…
Changes in v22:
- Rebase onto v6.18-rc1.
- Cover letter updates.
- Link to v21: https://lore.kernel.org/r/20250916-clone3-shadow-stack-v21-0-910493527013@k…
Changes in v21:
- Rebase onto https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git kernel-6.18.clone3
- Rename shadow_stack_token to shstk_token, since it's a simple rename I've
kept the acks and reviews but I dropped the tested-bys just to be safe.
- Link to v20: https://lore.kernel.org/r/20250902-clone3-shadow-stack-v20-0-4d9fff1c53e7@k…
Changes in v20:
- Comment fixes and clarifications in x86 arch_shstk_validate_clone()
from Rick Edgecombe.
- Spelling fix in documentation.
- Link to v19: https://lore.kernel.org/r/20250819-clone3-shadow-stack-v19-0-bc957075479b@k…
Changes in v19:
- Rebase onto v6.17-rc1.
- Link to v18: https://lore.kernel.org/r/20250702-clone3-shadow-stack-v18-0-7965d2b694db@k…
Changes in v18:
- Rebase onto v6.16-rc3.
- Thanks to pointers from Yuri Khrustalev this version has been tested
on x86 so I have removed the RFT tag.
- Clarify clone3_shadow_stack_valid() comment about the Kconfig check.
- Remove redundant GCSB DSYNCs in arm64 code.
- Fix token validation on x86.
- Link to v17: https://lore.kernel.org/r/20250609-clone3-shadow-stack-v17-0-8840ed97ff6f@k…
Changes in v17:
- Rebase onto v6.16-rc1.
- Link to v16: https://lore.kernel.org/r/20250416-clone3-shadow-stack-v16-0-2ffc9ca3917b@k…
Changes in v16:
- Rebase onto v6.15-rc2.
- Roll in fixes from x86 testing from Rick Edgecombe.
- Rework so that the argument is shadow_stack_token.
- Link to v15: https://lore.kernel.org/r/20250408-clone3-shadow-stack-v15-0-3fa245c6e3be@k…
Changes in v15:
- Rebase onto v6.15-rc1.
- Link to v14: https://lore.kernel.org/r/20250206-clone3-shadow-stack-v14-0-805b53af73b9@k…
Changes in v14:
- Rebase onto v6.14-rc1.
- Link to v13: https://lore.kernel.org/r/20241203-clone3-shadow-stack-v13-0-93b89a81a5ed@k…
Changes in v13:
- Rebase onto v6.13-rc1.
- Link to v12: https://lore.kernel.org/r/20241031-clone3-shadow-stack-v12-0-7183eb8bee17@k…
Changes in v12:
- Add the regular prctl() to the userspace API document since arm64
support is queued in -next.
- Link to v11: https://lore.kernel.org/r/20241005-clone3-shadow-stack-v11-0-2a6a2bd6d651@k…
Changes in v11:
- Rebase onto arm64 for-next/gcs, which is based on v6.12-rc1, and
integrate arm64 support.
- Rework the interface to specify a shadow stack pointer rather than a
base and size like we do for the regular stack.
- Link to v10: https://lore.kernel.org/r/20240821-clone3-shadow-stack-v10-0-06e8797b9445@k…
Changes in v10:
- Integrate fixes & improvements for the x86 implementation from Rick
Edgecombe.
- Require that the shadow stack be VM_WRITE.
- Require that the shadow stack base and size be sizeof(void *) aligned.
- Clean up trailing newline.
- Link to v9: https://lore.kernel.org/r/20240819-clone3-shadow-stack-v9-0-962d74f99464@ke…
Changes in v9:
- Pull token validation earlier and report problems with an error return
to parent rather than signal delivery to the child.
- Verify that the top of the supplied shadow stack is VM_SHADOW_STACK.
- Rework token validation to only do the page mapping once.
- Drop no longer needed support for testing for signals in selftest.
- Fix typo in comments.
- Link to v8: https://lore.kernel.org/r/20240808-clone3-shadow-stack-v8-0-0acf37caf14c@ke…
Changes in v8:
- Fix token verification with user specified shadow stack.
- Don't track user managed shadow stacks for child processes.
- Link to v7: https://lore.kernel.org/r/20240731-clone3-shadow-stack-v7-0-a9532eebfb1d@ke…
Changes in v7:
- Rebase onto v6.11-rc1.
- Typo fixes.
- Link to v6: https://lore.kernel.org/r/20240623-clone3-shadow-stack-v6-0-9ee7783b1fb9@ke…
Changes in v6:
- Rebase onto v6.10-rc3.
- Ensure we don't try to free the parent shadow stack in error paths of
x86 arch code.
- Spelling fixes in userspace API document.
- Additional cleanups and improvements to the clone3() tests to support
the shadow stack tests.
- Link to v5: https://lore.kernel.org/r/20240203-clone3-shadow-stack-v5-0-322c69598e4b@ke…
Changes in v5:
- Rebase onto v6.8-rc2.
- Rework ABI to have the user allocate the shadow stack memory with
map_shadow_stack() and a token.
- Force inlining of the x86 shadow stack enablement.
- Move shadow stack enablement out into a shared header for reuse by
other tests.
- Link to v4: https://lore.kernel.org/r/20231128-clone3-shadow-stack-v4-0-8b28ffe4f676@ke…
Changes in v4:
- Formatting changes.
- Use a define for minimum shadow stack size and move some basic
validation to fork.c.
- Link to v3: https://lore.kernel.org/r/20231120-clone3-shadow-stack-v3-0-a7b8ed3e2acc@ke…
Changes in v3:
- Rebase onto v6.7-rc2.
- Remove stale shadow_stack in internal kargs.
- If a shadow stack is specified unconditionally use it regardless of
CLONE_ parameters.
- Force enable shadow stacks in the selftest.
- Update changelogs for RISC-V feature rename.
- Link to v2: https://lore.kernel.org/r/20231114-clone3-shadow-stack-v2-0-b613f8681155@ke…
Changes in v2:
- Rebase onto v6.7-rc1.
- Remove ability to provide preallocated shadow stack, just specify the
desired size.
- Link to v1: https://lore.kernel.org/r/20231023-clone3-shadow-stack-v1-0-d867d0b5d4d0@ke…
---
Mark Brown (8):
arm64/gcs: Return a success value from gcs_alloc_thread_stack()
Documentation: userspace-api: Add shadow stack API documentation
selftests: Provide helper header for shadow stack testing
fork: Add shadow stack support to clone3()
selftests/clone3: Remove redundant flushes of output streams
selftests/clone3: Factor more of main loop into test_clone3()
selftests/clone3: Allow tests to flag if -E2BIG is a valid error code
selftests/clone3: Test shadow stack support
Documentation/userspace-api/index.rst | 1 +
Documentation/userspace-api/shadow_stack.rst | 44 +++++
arch/arm64/include/asm/gcs.h | 8 +-
arch/arm64/kernel/process.c | 8 +-
arch/arm64/mm/gcs.c | 55 +++++-
arch/x86/include/asm/shstk.h | 11 +-
arch/x86/kernel/process.c | 2 +-
arch/x86/kernel/shstk.c | 53 ++++-
include/asm-generic/cacheflush.h | 11 ++
include/linux/sched/task.h | 17 ++
include/uapi/linux/sched.h | 9 +-
kernel/fork.c | 93 +++++++--
tools/testing/selftests/clone3/clone3.c | 226 ++++++++++++++++++----
tools/testing/selftests/clone3/clone3_selftests.h | 65 ++++++-
tools/testing/selftests/ksft_shstk.h | 98 ++++++++++
15 files changed, 620 insertions(+), 81 deletions(-)
---
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
change-id: 20231019-clone3-shadow-stack-15d40d2bf536
Best regards,
--
Mark Brown <broonie(a)kernel.org>
The devpts_pts selftest has an ifdef in case an architecture does not
define TIOCGPTPEER, but the handling for this is broken since we need
errno to be set to EINVAL in order to skip the test as we should. Given
that this ioctl() has been defined since v4.15 we may as well just assume
it's there rather than write handling code which will probably never get
used.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v2:
- Rebase onto v6.19-rc1.
- Link to v1: https://patch.msgid.link/20251126-selftests-filesystems-devpts-tiocgptpeer-…
---
tools/testing/selftests/filesystems/devpts_pts.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/tools/testing/selftests/filesystems/devpts_pts.c b/tools/testing/selftests/filesystems/devpts_pts.c
index 54fea349204e..950e8b7f675b 100644
--- a/tools/testing/selftests/filesystems/devpts_pts.c
+++ b/tools/testing/selftests/filesystems/devpts_pts.c
@@ -100,7 +100,7 @@ static int resolve_procfd_symlink(int fd, char *buf, size_t buflen)
static int do_tiocgptpeer(char *ptmx, char *expected_procfd_contents)
{
int ret;
- int master = -1, slave = -1, fret = -1;
+ int master = -1, slave, fret = -1;
master = open(ptmx, O_RDWR | O_NOCTTY | O_CLOEXEC);
if (master < 0) {
@@ -119,9 +119,7 @@ static int do_tiocgptpeer(char *ptmx, char *expected_procfd_contents)
goto do_cleanup;
}
-#ifdef TIOCGPTPEER
slave = ioctl(master, TIOCGPTPEER, O_RDWR | O_NOCTTY | O_CLOEXEC);
-#endif
if (slave < 0) {
if (errno == EINVAL) {
fprintf(stderr, "TIOCGPTPEER is not supported. "
---
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
change-id: 20251126-selftests-filesystems-devpts-tiocgptpeer-fbd30e579859
Best regards,
--
Mark Brown <broonie(a)kernel.org>
During my testing, I found that guest debugging with 'DR6.BD' does not
work in instruction emulation, as the current code only considers the
guest's DR7. Upon reviewing the code, I also observed that the checks
for the userspace guest debugging feature and the guest's own debugging
feature are repeated in different places during instruction
emulation, but the overall logic is the same. If guest debug
is enabled, it needs to exit to userspace; otherwise, a #DB
exception needs to be injected into the guest. Therefore, as
suggested by Jiangshan Lai, some cleanup has been done for #DB
handling in instruction emulation in this patchset. A new
function named 'kvm_inject_emulated_db()' is introduced to
consolidate all the checking logic. Moreover, I hope we can make
the #DB interception path use the same function as well.
Additionally, when I looked into the single-step #DB handling in
instruction emulation, I noticed that the interrupt shadow is toggled,
but it is not considered in the single-step #DB injection. This
oversight causes VM entry to fail on VMX (due to pending debug
exceptions state checking).
As pointed out by Sean, fault-like code #DBs can be coincident with
trap-like single-step #DBs at the instruction boundary on the hardware.
However it is difficult to emulate this in the emulator, as
kvm_vcpu_check_code_breakpoint() is called at the start of the next
instruction while the single-step #DB for the previous instruction has
already been injected.
v1->v2:
- cleanup in inject_emulated_exception().
- rename 'set_pending_dbg' callback as 'refresh_pending_dbg_exceptions'.
- fold refresh_pending_dbg_exceptions() call into
kvm_vcpu_do_singlestep().
- Split the change to move up kvm_set_rflags() into a single patch.
- Move the #DB and IRQ handler registration after guest debug testcases.
Hou Wenlong (9):
KVM: x86: Capture "struct x86_exception" in
inject_emulated_exception()
KVM: x86: Set guest DR6 by kvm_queue_exception_p() in instruction
emulation
KVM: x86: Check guest debug in DR access instruction emulation
KVM: x86: Only check effective code breakpoint in emulation
KVM: x86: Consolidate KVM_GUESTDBG_SINGLESTEP check into the
kvm_inject_emulated_db()
KVM: x86: Move kvm_set_rflags() up before kvm_vcpu_do_singlestep()
KVM: VMX: Refresh 'PENDING_DBG_EXCEPTIONS.BS' bit during instruction
emulation
KVM: selftests: Verify guest debug DR7.GD checking during instruction
emulation
KVM: selftests: Verify 'BS' bit checking in pending debug exception
during VM entry
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/emulate.c | 14 +--
arch/x86/kvm/kvm_emulate.h | 7 +-
arch/x86/kvm/vmx/main.c | 9 ++
arch/x86/kvm/vmx/vmx.c | 15 ++-
arch/x86/kvm/vmx/x86_ops.h | 1 +
arch/x86/kvm/x86.c | 116 ++++++++++--------
arch/x86/kvm/x86.h | 7 ++
.../selftests/kvm/include/x86/processor.h | 3 +-
tools/testing/selftests/kvm/x86/debug_regs.c | 72 ++++++++++-
11 files changed, 178 insertions(+), 68 deletions(-)
base-commit: 5d3e2d9ba9ed68576c70c127e4f7446d896f2af2
--
2.31.1
During my testing, I found that guest debugging with 'DR6.BD' does not
work in instruction emulation, as the current code only considers the
guest's DR7. Upon reviewing the code, I also observed that the checks
for the userspace guest debugging feature and the guest's own debugging
feature are repeated in different places during instruction
emulation, but the overall logic is the same. If guest debugging
is enabled, it needs to exit to userspace; otherwise, a #DB
exception needs to be injected into the guest. Therefore, as
suggested by Jiangshan Lai, some cleanup has been done for #DB
handling in instruction emulation in this patchset. A new
function named 'kvm_inject_emulated_db()' is introduced to
consolidate all the checking logic. Moreover, I hope we can make
the #DB interception path use the same function as well.
Additionally, when I looked into the single-step #DB handling in
instruction emulation, I noticed that the interrupt shadow is toggled,
but it is not considered in the single-step #DB injection. This
oversight causes VM entry to fail on VMX (due to pending debug
exceptions checking) or breaks the 'MOV SS' suppressed #DB. For the
latter, I have kept the behavior for now in my patchset, as I need some
suggestions.
Hou Wenlong (7):
KVM: x86: Set guest DR6 by kvm_queue_exception_p() in instruction
emulation
KVM: x86: Check guest debug in DR access instruction emulation
KVM: x86: Only check effective code breakpoint in emulation
KVM: x86: Consolidate KVM_GUESTDBG_SINGLESTEP check into the
kvm_inject_emulated_db()
KVM: VMX: Set 'BS' bit in pending debug exceptions during instruction
emulation
KVM: selftests: Verify guest debug DR7.GD checking during instruction
emulation
KVM: selftests: Verify 'BS' bit checking in pending debug exception
during VM entry
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/emulate.c | 14 +--
arch/x86/kvm/kvm_emulate.h | 7 +-
arch/x86/kvm/vmx/main.c | 9 ++
arch/x86/kvm/vmx/vmx.c | 14 ++-
arch/x86/kvm/vmx/x86_ops.h | 1 +
arch/x86/kvm/x86.c | 109 +++++++++++-------
arch/x86/kvm/x86.h | 7 ++
.../selftests/kvm/include/x86/processor.h | 3 +-
tools/testing/selftests/kvm/x86/debug_regs.c | 64 +++++++++-
11 files changed, 167 insertions(+), 63 deletions(-)
base-commit: ecbcc2461839e848970468b44db32282e5059925
--
2.31.1
This series makes the output from the ofdlocks test a bit easier for
tooling to work with, and also ignores the generated file while we're
here.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v3:
- Rebase onto v6.19-rc1.
- Link to v2: https://lore.kernel.org/r/20251015-selftest-filelock-ktap-v2-0-f5fd21b75c3a…
Changes in v2:
- Rebase onto v6.18-rc1.
- Link to v1: https://lore.kernel.org/r/20250818-selftest-filelock-ktap-v1-0-d41af77f1396…
---
Mark Brown (3):
kselftest/filelock: Use ksft_perror()
kselftest/filelock: Report each test in oftlocks separately
kselftest/filelock: Add a .gitignore file
tools/testing/selftests/filelock/.gitignore | 1 +
tools/testing/selftests/filelock/ofdlocks.c | 94 +++++++++++++----------------
2 files changed, 42 insertions(+), 53 deletions(-)
---
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
change-id: 20250604-selftest-filelock-ktap-f2ae998a0de0
Best regards,
--
Mark Brown <broonie(a)kernel.org>
Hello,
While writing some Kunit tests for ext4 filesystem, I'm encountering an
issue in the way we display the diagnostic logs upon failures, when
using KUNIT_CASE_PARAM() to write the tests.
This can be observed by patching fs/ext4/mballoc-test.c to fail
and print one of the params:
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -350,6 +350,8 @@ static int mbt_kunit_init(struct kunit *test)
struct super_block *sb;
int ret;
+ KUNIT_FAIL(test, "Failed: blocksize_bits=%d", layout->blocksize_bits);
+
sb = mbt_ext4_alloc_super_block();
if (sb == NULL)
return -ENOMEM;
With the above change, we can observe the following output (snipped):
[18:50:25] ============== ext4_mballoc_test (7 subtests) ==============
[18:50:25] ================= test_new_blocks_simple ==================
[18:50:25] [FAILED] block_bits=10 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
[18:50:25] # test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
[18:50:25] Failed: blocksize_bits=12
[18:50:25] [FAILED] block_bits=12 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
[18:50:25] # test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
[18:50:25] Failed: blocksize_bits=16
[18:50:25] [FAILED] block_bits=16 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
[18:50:25] # test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
[18:50:25] Failed: blocksize_bits=10
[18:50:25] # test_new_blocks_simple: pass:0 fail:3 skip:0 total:3
[18:50:25] ============= [FAILED] test_new_blocks_simple ==============
<snip>
Note that the diagnostic logs don't show up correctly. Ideally they
should be before test result but here the first [FAILED] test has no
logs printed above whereas the last "Failed: blocksize_bits=10" print
comes after the last subtest, when it actually corresponds to the first
subtest.
The KTAP file itself seems to have diagnostic logs in the right place:
KTAP version 1
1..2
KTAP version 1
# Subtest: ext4_mballoc_test
# module: ext4
1..7
KTAP version 1
# Subtest: test_new_blocks_simple
# test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
Failed: blocksize_bits=10
not ok 1 block_bits=10 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
# test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
Failed: blocksize_bits=12
not ok 2 block_bits=12 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
# test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
Failed: blocksize_bits=16
not ok 3 block_bits=16 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
# test_new_blocks_simple: pass:0 fail:3 skip:0 total:3
not ok 1 test_new_blocks_simple
<snip>
By tracing kunit_parser.py script, I could see the issue here is in the
parsing of the "Subtest: test_new_blocks_simple". We end up associating
everything below the subtest till "not ok 1 block_bits=10..." as
diagnostic logs of the subtest, while these lons actually belong to the
first of the 3 subtests under this test.
I tired to figure out a way to fix the parsing but fixing one thing
broke something else. Im starting to think that the issue is that there
are 3 subtests under test_new_block_simple (array of 3 params passed to
KUNIT_CASE_PARAM), but instead of creating 3 structured subtests, the
KTAP output dumps the results of all 3 directly under
subtest:test_new_blocks_simple. Which makes it tricky to determine where
the diagnostic log/attributes of test_new_blocks_simple ends and that of its
children begins.
I'm not very familiar with KUnit framework so I though I'd reach out
here for some pointers. I can dedicate some time fixing this but I'd
like to know if this is something we need to somehow fix in parsing or
during generation of the KTAP file itself? Any pointers would be
appreciated.
Thanks,
Ojaswin
From: Li RongQing <lirongqing(a)baidu.com>
The softlockup_panic sysctl is currently a binary option: panic immediately
or never panic on soft lockups.
Panicking on any soft lockup, regardless of duration, can be overly
aggressive for brief stalls that may be caused by legitimate operations.
Conversely, never panicking may allow severe system hangs to persist
undetected.
Extend softlockup_panic to accept an integer threshold, allowing the kernel
to panic only when the normalized lockup duration exceeds N watchdog
threshold periods. This provides finer-grained control to distinguish
between transient delays and persistent system failures.
The accepted values are:
- 0: Don't panic (unchanged)
- 1: Panic when duration >= 1 * threshold (20s default, original behavior)
- N > 1: Panic when duration >= N * threshold (e.g., 2 = 40s, 3 = 60s.)
The original behavior is preserved for values 0 and 1, maintaining full
backward compatibility while allowing systems to tolerate brief lockups
while still catching severe, persistent hangs.
Signed-off-by: Li RongQing <lirongqing(a)baidu.com>
Cc: Eduard Zingerman <eddyz87(a)gmail.com>
Cc: Hao Luo <haoluo(a)google.com>
Cc: Jiri Olsa <jolsa(a)kernel.org>
Cc: John Fastabend <john.fastabend(a)gmail.com>
Cc: KP Singh <kpsingh(a)kernel.org>
Cc: Lance Yang <lance.yang(a)linux.dev>
Cc: Martin KaFai Lau <martin.lau(a)linux.dev>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Song Liu <song(a)kernel.org>
Cc: Stanislav Fomichev <sdf(a)fomichev.me>
Cc: Yonghong Song <yonghong.song(a)linux.dev>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
---
Diff with v1: add a temp variable thresh_count
chang config to 0 in kernel/configs/debug.config
Documentation/admin-guide/kernel-parameters.txt | 10 +++++-----
arch/arm/configs/aspeed_g5_defconfig | 2 +-
arch/arm/configs/pxa3xx_defconfig | 2 +-
arch/openrisc/configs/or1klitex_defconfig | 2 +-
arch/powerpc/configs/skiroot_defconfig | 2 +-
drivers/gpu/drm/ci/arm.config | 2 +-
drivers/gpu/drm/ci/arm64.config | 2 +-
drivers/gpu/drm/ci/x86_64.config | 2 +-
kernel/configs/debug.config | 2 +-
kernel/watchdog.c | 10 ++++++----
lib/Kconfig.debug | 13 +++++++------
tools/testing/selftests/bpf/config | 2 +-
tools/testing/selftests/wireguard/qemu/kernel.config | 2 +-
13 files changed, 28 insertions(+), 25 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a8d0afd..27c5f96 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6934,12 +6934,12 @@ Kernel parameters
softlockup_panic=
[KNL] Should the soft-lockup detector generate panics.
- Format: 0 | 1
+ Format: <int>
- A value of 1 instructs the soft-lockup detector
- to panic the machine when a soft-lockup occurs. It is
- also controlled by the kernel.softlockup_panic sysctl
- and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
+ A value of non-zero instructs the soft-lockup detector
+ to panic the machine when a soft-lockup duration exceeds
+ N thresholds. It is also controlled by the kernel.softlockup_panic
+ sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
respective build-time switch to that functionality.
softlockup_all_cpu_backtrace=
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 2e6ea13..ec558e5 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -306,7 +306,7 @@ CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_PANIC_TIMEOUT=-1
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_WQ_WATCHDOG=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig
index 07d422f..fb272e3 100644
--- a/arch/arm/configs/pxa3xx_defconfig
+++ b/arch/arm/configs/pxa3xx_defconfig
@@ -100,7 +100,7 @@ CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_KERNEL=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_SHIRQ=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
# CONFIG_SCHED_DEBUG is not set
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_SPINLOCK_SLEEP=y
diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig
index fb1eb9a..984b0e3 100644
--- a/arch/openrisc/configs/or1klitex_defconfig
+++ b/arch/openrisc/configs/or1klitex_defconfig
@@ -52,5 +52,5 @@ CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,bpf"
CONFIG_PRINTK_TIME=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BUG_ON_DATA_CORRUPTION=y
diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
index 2b71a6d..a4114fc 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -289,7 +289,7 @@ CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
CONFIG_WQ_WATCHDOG=y
diff --git a/drivers/gpu/drm/ci/arm.config b/drivers/gpu/drm/ci/arm.config
index 411e814..d7c5167 100644
--- a/drivers/gpu/drm/ci/arm.config
+++ b/drivers/gpu/drm/ci/arm.config
@@ -52,7 +52,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=n
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=n
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0
CONFIG_FW_LOADER_COMPRESS=y
diff --git a/drivers/gpu/drm/ci/arm64.config b/drivers/gpu/drm/ci/arm64.config
index fddfbd4..ea0e307 100644
--- a/drivers/gpu/drm/ci/arm64.config
+++ b/drivers/gpu/drm/ci/arm64.config
@@ -161,7 +161,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_DETECT_HUNG_TASK=y
diff --git a/drivers/gpu/drm/ci/x86_64.config b/drivers/gpu/drm/ci/x86_64.config
index 8eaba388..7ac98a7 100644
--- a/drivers/gpu/drm/ci/x86_64.config
+++ b/drivers/gpu/drm/ci/x86_64.config
@@ -47,7 +47,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_DETECT_HUNG_TASK=y
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 9f6ab7d..774702591 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -84,7 +84,7 @@ CONFIG_SLUB_DEBUG_ON=y
# Debug Oops, Lockups and Hangs
#
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
-# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_PANIC_ON_OOPS=y
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 0685e3a..8168e0d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly;
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
- IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC;
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
@@ -774,8 +774,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
unsigned long touch_ts, period_ts, now;
struct pt_regs *regs = get_irq_regs();
- int duration;
int softlockup_all_cpu_backtrace;
+ int duration, thresh_count;
unsigned long flags;
if (!watchdog_enabled)
@@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT);
- if (softlockup_panic)
+ thresh_count = duration / get_softlockup_thresh();
+
+ if (softlockup_panic && thresh_count >= softlockup_panic)
panic("softlockup: hung tasks");
}
@@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "softlockup_sys_info",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ba36939..17a7a77 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1110,13 +1110,14 @@ config SOFTLOCKUP_DETECTOR_INTR_STORM
the CPU stats and the interrupt counts during the "soft lockups".
config BOOTPARAM_SOFTLOCKUP_PANIC
- bool "Panic (Reboot) On Soft Lockups"
+ int "Panic (Reboot) On Soft Lockups"
depends on SOFTLOCKUP_DETECTOR
+ default 0
help
- Say Y here to enable the kernel to panic on "soft lockups",
- which are bugs that cause the kernel to loop in kernel
- mode for more than 20 seconds (configurable using the watchdog_thresh
- sysctl), without giving other tasks a chance to run.
+ Set to a non-zero value N to enable the kernel to panic on "soft
+ lockups", which are bugs that cause the kernel to loop in kernel
+ mode for more than (N * 20 seconds) (configurable using the
+ watchdog_thresh sysctl), without giving other tasks a chance to run.
The panic can be used in combination with panic_timeout,
to cause the system to reboot automatically after a
@@ -1124,7 +1125,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC
high-availability systems that have uptime guarantees and
where a lockup must be resolved ASAP.
- Say N if unsure.
+ Say 0 if unsure.
config HAVE_HARDLOCKUP_DETECTOR_BUDDY
bool
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 558839e..2485538 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -1,6 +1,6 @@
CONFIG_BLK_DEV_LOOP=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BPF=y
CONFIG_BPF_EVENTS=y
CONFIG_BPF_JIT=y
diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config
index 0504c11..bb89d2d 100644
--- a/tools/testing/selftests/wireguard/qemu/kernel.config
+++ b/tools/testing/selftests/wireguard/qemu/kernel.config
@@ -80,7 +80,7 @@ CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_WQ_WATCHDOG=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_PANIC_TIMEOUT=-1
CONFIG_STACKTRACE=y
--
2.9.4
sched_ext tasks can be starved by long-running RT tasks, especially since
RT throttling was replaced by deadline servers to boost only SCHED_NORMAL
tasks.
Several users in the community have reported issues with RT stalling
sched_ext tasks. This is fairly common on distributions or environments
where applications like video compositors, audio services, etc. run as RT
tasks by default.
Example trace (showing a per-CPU kthread stalled due to the sway Wayland
compositor running as an RT task):
runnable task stall (kworker/0:0[106377] failed to run for 5.043s)
...
CPU 0 : nr_run=3 flags=0xd cpu_rel=0 ops_qseq=20646200 pnt_seq=45388738
curr=sway[994] class=rt_sched_class
R kworker/0:0[106377] -5043ms
scx_state/flags=3/0x1 dsq_flags=0x0 ops_state/qseq=0/0
sticky/holding_cpu=-1/-1 dsq_id=0x8000000000000002 dsq_vtime=0 slice=20000000
cpus=01
This is often perceived as a bug in the BPF schedulers, but in reality they
can't do much: RT tasks run outside their control and can potentially
consume 100% of the CPU bandwidth.
Fix this by adding a sched_ext deadline server, so that sched_ext tasks are
also boosted and do not suffer starvation.
Two kselftests are also provided to verify the starvation fixes and
bandwidth allocation is correct.
== Design ==
- The EXT server is initialized at boot time and remains configured
throughout the system's lifetime
- It starts automatically when the first sched_ext task is enqueued
(rq->scx.nr_running == 1)
- The server's pick function (ext_server_pick_task) always selects
sched_ext tasks when active
- Runtime accounting happens in update_curr_scx() during task execution
and update_curr_idle() when idle
- Bandwidth accounting includes both fair and ext servers in root domain
calculations
- A debugfs interface (/sys/kernel/debug/sched/ext_server/) allows runtime
tuning of server parameters
== Highlights in this version ==
As discussed at the sched_ext microconference at LPC Tokyo, the plan is to
start with a simpler approach, avoiding automatically creating or tearing
down the EXT server bandwidth reservation when a BPF scheduler is loaded or
unloaded. Instead, the reservation is kept permanently active. This
significantly simplifies the logic while still addressing the starvation
issue.
Any fine-tuning of the bandwidth reservation is delegated to the system
administrator, who can adjust it via the debugfs interface. In the future,
a more suitable interface can be introduced and automatic removal of the
reservation when the BPF scheduler is unloaded can be revisited.
This patchset is also available in the following git branch:
git://git.kernel.org/pub/scm/linux/kernel/git/arighi/linux.git scx-dl-server
Changes in v11:
- do not create/remove the bandwidth reservation for the ext server when a
BPF scheduler is loaded/unloaded, but keep the reservation bandwdith
always active
- change rt_stall kselftest to validate both FAIR and EXT DL servers
- Link to v10: https://lore.kernel.org/all/20250903095008.162049-1-arighi@nvidia.com/
Changes in v10:
- reordered patches to better isolate sched_ext changes vs sched/deadline
changes (Andrea Righi)
- define ext_server only with CONFIG_SCHED_CLASS_EXT=y (Andrea Righi)
- add WARN_ON_ONCE(!cpus) check in dl_server_apply_params() (Andrea Righi)
- wait for inactive_task_timer to fire before removing the bandwidth
reservation (Juri Lelli)
- remove explicit dl_server_stop() in dequeue_task_scx() to reduce timer
reprogramming overhead (Juri Lelli)
- do not restart pick_task() when invoked by the dl_server (Tejun Heo)
- rename rq_dl_server to dl_server (Peter Zijlstra)
- fixed a missing dl_server start in dl_server_on() (Christian Loehle)
- add a comment to the rt_stall selftest to better explain the 4%
threshold (Emil Tsalapatis)
- Link to v9: https://lore.kernel.org/all/20251017093214.70029-1-arighi@nvidia.com/
Changes in v9:
- Drop the ->balance() logic as its functionality is now integrated into
->pick_task(), allowing dl_server to call pick_task_scx() directly
- Link to v8: https://lore.kernel.org/all/20250903095008.162049-1-arighi@nvidia.com/
Changes in v8:
- Add tj's patch to de-couple balance and pick_task and avoid changing
sched/core callbacks to propagate @rf
- Simplify dl_se->dl_server check (suggested by PeterZ)
- Small coding style fixes in the kselftests
- Link to v7: https://lore.kernel.org/all/20250809184800.129831-1-joelagnelf@nvidia.com/
Changes in v7:
- Rebased to Linus master
- Link to v6: https://lore.kernel.org/all/20250702232944.3221001-1-joelagnelf@nvidia.com/
Changes in v6:
- Added Acks to few patches
- Fixes to few nits suggested by Tejun
- Link to v5: https://lore.kernel.org/all/20250620203234.3349930-1-joelagnelf@nvidia.com/
Changes in v5:
- Added a kselftest (total_bw) to sched_ext to verify bandwidth values
from debugfs
- Address comment from Andrea about redundant rq clock invalidation
- Link to v4: https://lore.kernel.org/all/20250617200523.1261231-1-joelagnelf@nvidia.com/
Changes in v4:
- Fixed issues with hotplugged CPUs having their DL server bandwidth
altered due to loading SCX
- Fixed other issues
- Rebased on Linus master
- All sched_ext kselftests reliably pass now, also verified that the
total_bw in debugfs (CONFIG_SCHED_DEBUG) is conserved with these patches
- Link to v3: https://lore.kernel.org/all/20250613051734.4023260-1-joelagnelf@nvidia.com/
Changes in v3:
- Removed code duplication in debugfs. Made ext interface separate
- Fixed issue where rq_lock_irqsave was not used in the relinquish patch
- Fixed running bw accounting issue in dl_server_remove_params
- Link to v2: https://lore.kernel.org/all/20250602180110.816225-1-joelagnelf@nvidia.com/
Changes in v2:
- Fixed a hang related to using rq_lock instead of rq_lock_irqsave
- Added support to remove BW of DL servers when they are switched to/from EXT
- Link to v1: https://lore.kernel.org/all/20250315022158.2354454-1-joelagnelf@nvidia.com/
Andrea Righi (2):
sched_ext: Add a DL server for sched_ext tasks
selftests/sched_ext: Add test for sched_ext dl_server
Joel Fernandes (5):
sched/deadline: Clear the defer params
sched/debug: Fix updating of ppos on server write ops
sched/debug: Stop and start server based on if it was active
sched/debug: Add support to change sched_ext server params
selftests/sched_ext: Add test for DL server total_bw consistency
kernel/sched/core.c | 6 +
kernel/sched/deadline.c | 87 +++++--
kernel/sched/debug.c | 171 +++++++++++---
kernel/sched/ext.c | 42 ++++
kernel/sched/idle.c | 3 +
kernel/sched/sched.h | 2 +
kernel/sched/topology.c | 5 +
tools/testing/selftests/sched_ext/Makefile | 2 +
tools/testing/selftests/sched_ext/rt_stall.bpf.c | 23 ++
tools/testing/selftests/sched_ext/rt_stall.c | 240 +++++++++++++++++++
tools/testing/selftests/sched_ext/total_bw.c | 281 +++++++++++++++++++++++
11 files changed, 811 insertions(+), 51 deletions(-)
create mode 100644 tools/testing/selftests/sched_ext/rt_stall.bpf.c
create mode 100644 tools/testing/selftests/sched_ext/rt_stall.c
create mode 100644 tools/testing/selftests/sched_ext/total_bw.c
This series creates a new PMU scheme on ARM, a partitioned PMU that
allows reserving a subset of counters for more direct guest access,
significantly reducing overhead. More details, including performance
benchmarks, can be read in the v1 cover letter linked below.
An overview of what this series accomplishes was presented at KVM
Forum 2025. Slides [1] and video [2] are linked below.
The long duration between v4 and v5 is due to time spent on this
project being monopolized preparing this feature for internal
production. As a result, there are too many improvements to fully list
here, but I will cover the notable ones.
v5:
* Rebase onto v6.18-rc7. This required pulling some reorganization
patches from Anish and Sean that were dependencies from previous
versions based on kvm/queue but never made it to upstream.
* Ensure FGTs (fine-grained traps) are correctly programmed at vCPU
load using kvm_vcpu_load_fgt() and helpers introduced by Oliver
Upton.
* Cleanly separate concerns of whether the partitioned PMU is enabled
for the guest and whether FGT should be enabled. This allows that
the capability can be VM-scoped while the implementation detail of
whether FGT and context switching are in effect can remain
vCPU-scoped.
* Shrink the uAPI change. Instead of a cap and corresponding ioctl,
the feature can be controlled by just a cap with an argument. The
cap is now also VM-scoped and enforces ordering that it should be
decided before vCPUs are created. Whether the cap is enabled is now
tracked by the new flag KVM_ARCH_ARM_PARTITIONED_PMU_ENABLED.
* Improve log messages when partitioning in the PMUv3 driver.
* Introduce a global variable armv8pmu_hpmn_max in the PMUv3 driver so
KVM code can read if a value was set before the PMU is probed. This
is needed to properly test if we have the capability before vCPUs
are created.
* Make it possible for a VMM to filter the HPMN0 feature bit.
* Fix event filter problems with PMEVTYPER handling in
writethrough_pmevtyper() and kvm_pmu_apply_event_filter() by using
kvm_pmu_event_mask() in the right spots. And if an event is
filtered, write the physical register with the appropriate exclude
bits set but keep the virtual register exactly what the guest wrote.
* Fix register access problems with the PMU register fast path handler
by lifting some static PMU access checks from sys_regs.c to use them
in the fast path too and make bit masking more strict for better ARM
compliance.
* Fix the readability and logic of programming the MDCR_EL2 register
when entering the guest. Make sure to set the HPME bit to allow host
counters to count guest events. Set TPM and TPMCR by default and
clear them if partitioning is enabled rather than the previous
inverted logic of leaving them clear and setting them if
partitioning is not enabled. Make the HPMN field computation more
clear.
* As part of lazy context switching, do a load when the guest is
switching to physical access to ensure any previous writes that only
reached the virtual registers reach the physical ones as well and
are not clobbered by the next vcpu_put().
* Other fixes and improvements that are too small to mention or left
out from my personal notes.
v4:
https://lore.kernel.org/kvmarm/20250714225917.1396543-1-coltonlewis@google.…
v3:
https://lore.kernel.org/kvm/20250626200459.1153955-1-coltonlewis@google.com/
v2:
https://lore.kernel.org/kvm/20250620221326.1261128-1-coltonlewis@google.com/
v1:
https://lore.kernel.org/kvm/20250602192702.2125115-1-coltonlewis@google.com/
[1] https://gitlab.com/qemu-project/kvm-forum/-/raw/main/_attachments/2025/Opti…
[2] https://www.youtube.com/watch?v=YRzZ8jMIA6M&list=PLW3ep1uCIRfxwmllXTOA2txfD…
Anish Ghulati (1):
KVM: arm64: Move arm_{psci,hypercalls}.h to an internal KVM path
Colton Lewis (20):
arm64: cpufeature: Add cpucap for HPMN0
KVM: arm64: Reorganize PMU functions
perf: arm_pmuv3: Introduce method to partition the PMU
perf: arm_pmuv3: Generalize counter bitmasks
perf: arm_pmuv3: Keep out of guest counter partition
KVM: arm64: Set up FGT for Partitioned PMU
KVM: arm64: Writethrough trapped PMEVTYPER register
KVM: arm64: Use physical PMSELR for PMXEVTYPER if partitioned
KVM: arm64: Writethrough trapped PMOVS register
KVM: arm64: Write fast path PMU register handlers
KVM: arm64: Setup MDCR_EL2 to handle a partitioned PMU
KVM: arm64: Account for partitioning in PMCR_EL0 access
KVM: arm64: Context swap Partitioned PMU guest registers
KVM: arm64: Enforce PMU event filter at vcpu_load()
KVM: arm64: Implement lazy PMU context swaps
perf: arm_pmuv3: Handle IRQs for Partitioned PMU guest counters
KVM: arm64: Inject recorded guest interrupts
KVM: arm64: Add KVM_CAP to partition the PMU
KVM: selftests: Add find_bit to KVM library
KVM: arm64: selftests: Add test case for partitioned PMU
Marc Zyngier (1):
KVM: arm64: Reorganize PMU includes
Sean Christopherson (2):
KVM: arm64: Include KVM headers to get forward declarations
KVM: arm64: Move ARM specific headers in include/kvm to arch directory
Documentation/virt/kvm/api.rst | 24 +
arch/arm/include/asm/arm_pmuv3.h | 28 +
arch/arm64/include/asm/arm_pmuv3.h | 61 +-
.../arm64/include/asm/kvm_arch_timer.h | 2 +
arch/arm64/include/asm/kvm_host.h | 24 +-
.../arm64/include/asm/kvm_pmu.h | 142 ++++
arch/arm64/include/asm/kvm_types.h | 7 +-
.../arm64/include/asm/kvm_vgic.h | 0
arch/arm64/kernel/cpufeature.c | 8 +
arch/arm64/kvm/Makefile | 2 +-
arch/arm64/kvm/arch_timer.c | 5 +-
arch/arm64/kvm/arm.c | 23 +-
{include => arch/arm64}/kvm/arm_hypercalls.h | 0
{include => arch/arm64}/kvm/arm_psci.h | 0
arch/arm64/kvm/config.c | 34 +-
arch/arm64/kvm/debug.c | 31 +-
arch/arm64/kvm/guest.c | 2 +-
arch/arm64/kvm/handle_exit.c | 2 +-
arch/arm64/kvm/hyp/Makefile | 6 +-
arch/arm64/kvm/hyp/include/hyp/switch.h | 211 ++++-
arch/arm64/kvm/hyp/nvhe/switch.c | 4 +-
arch/arm64/kvm/hyp/vhe/switch.c | 4 +-
arch/arm64/kvm/hypercalls.c | 4 +-
arch/arm64/kvm/pmu-direct.c | 464 +++++++++++
arch/arm64/kvm/pmu-emul.c | 678 +---------------
arch/arm64/kvm/pmu.c | 726 ++++++++++++++++++
arch/arm64/kvm/psci.c | 4 +-
arch/arm64/kvm/pvtime.c | 2 +-
arch/arm64/kvm/reset.c | 3 +-
arch/arm64/kvm/sys_regs.c | 110 +--
arch/arm64/kvm/trace_arm.h | 2 +-
arch/arm64/kvm/trng.c | 2 +-
arch/arm64/kvm/vgic/vgic-debug.c | 2 +-
arch/arm64/kvm/vgic/vgic-init.c | 2 +-
arch/arm64/kvm/vgic/vgic-irqfd.c | 2 +-
arch/arm64/kvm/vgic/vgic-kvm-device.c | 2 +-
arch/arm64/kvm/vgic/vgic-mmio-v2.c | 2 +-
arch/arm64/kvm/vgic/vgic-mmio-v3.c | 2 +-
arch/arm64/kvm/vgic/vgic-mmio.c | 4 +-
arch/arm64/kvm/vgic/vgic-v2.c | 2 +-
arch/arm64/kvm/vgic/vgic-v3-nested.c | 3 +-
arch/arm64/kvm/vgic/vgic-v3.c | 2 +-
arch/arm64/kvm/vgic/vgic-v5.c | 2 +-
arch/arm64/tools/cpucaps | 1 +
arch/arm64/tools/sysreg | 6 +-
drivers/perf/arm_pmuv3.c | 137 +++-
include/linux/perf/arm_pmu.h | 1 +
include/linux/perf/arm_pmuv3.h | 14 +-
include/uapi/linux/kvm.h | 1 +
tools/include/uapi/linux/kvm.h | 1 +
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../selftests/kvm/arm64/vpmu_counter_access.c | 77 +-
tools/testing/selftests/kvm/lib/find_bit.c | 1 +
53 files changed, 2049 insertions(+), 831 deletions(-)
rename include/kvm/arm_arch_timer.h => arch/arm64/include/asm/kvm_arch_timer.h (98%)
rename include/kvm/arm_pmu.h => arch/arm64/include/asm/kvm_pmu.h (61%)
rename include/kvm/arm_vgic.h => arch/arm64/include/asm/kvm_vgic.h (100%)
rename {include => arch/arm64}/kvm/arm_hypercalls.h (100%)
rename {include => arch/arm64}/kvm/arm_psci.h (100%)
create mode 100644 arch/arm64/kvm/pmu-direct.c
create mode 100644 tools/testing/selftests/kvm/lib/find_bit.c
base-commit: ac3fd01e4c1efce8f2c054cdeb2ddd2fc0fb150d
--
2.52.0.239.gd5f0c6e74e-goog
The IA32 Emulation support can be either removed from the kernel,
disabled by default or disabled at runtime. Some of x86 selftests
are crashing for all of above thus is_32bit_syscall_supported()
helper is added to skip int80 syscalls if they are not supported.
Slawomir Rosek (2):
selftests/x86/ldt_gdt: Skip int80 if not supported
selftests/x86/ptrace_syscall: Skip int80 if not supported
tools/testing/selftests/x86/ldt_gdt.c | 21 +++++++++++++++++++-
tools/testing/selftests/x86/ptrace_syscall.c | 20 +++++++++++++++++--
2 files changed, 38 insertions(+), 3 deletions(-)
--
2.52.0.305.g3fc767764a-goog
From: Li RongQing <lirongqing(a)baidu.com>
The softlockup_panic sysctl is currently a binary option: panic immediately
or never panic on soft lockups.
Panicking on any soft lockup, regardless of duration, can be overly
aggressive for brief stalls that may be caused by legitimate operations.
Conversely, never panicking may allow severe system hangs to persist
undetected.
Extend softlockup_panic to accept an integer threshold, allowing the kernel
to panic only when the normalized lockup duration exceeds N watchdog
threshold periods. This provides finer-grained control to distinguish
between transient delays and persistent system failures.
The accepted values are:
- 0: Don't panic (unchanged)
- 1: Panic when duration >= 1 * threshold (20s default, original behavior)
- N > 1: Panic when duration >= N * threshold (e.g., 2 = 40s, 3 = 60s.)
The original behavior is preserved for values 0 and 1, maintaining full
backward compatibility while allowing systems to tolerate brief lockups
while still catching severe, persistent hangs.
Signed-off-by: Li RongQing <lirongqing(a)baidu.com>
---
Documentation/admin-guide/kernel-parameters.txt | 10 +++++-----
arch/arm/configs/aspeed_g5_defconfig | 2 +-
arch/arm/configs/pxa3xx_defconfig | 2 +-
arch/openrisc/configs/or1klitex_defconfig | 2 +-
arch/powerpc/configs/skiroot_defconfig | 2 +-
drivers/gpu/drm/ci/arm.config | 2 +-
drivers/gpu/drm/ci/arm64.config | 2 +-
drivers/gpu/drm/ci/x86_64.config | 2 +-
kernel/watchdog.c | 8 +++++---
lib/Kconfig.debug | 13 +++++++------
tools/testing/selftests/bpf/config | 2 +-
tools/testing/selftests/wireguard/qemu/kernel.config | 2 +-
12 files changed, 26 insertions(+), 23 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a8d0afd..27c5f96 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6934,12 +6934,12 @@ Kernel parameters
softlockup_panic=
[KNL] Should the soft-lockup detector generate panics.
- Format: 0 | 1
+ Format: <int>
- A value of 1 instructs the soft-lockup detector
- to panic the machine when a soft-lockup occurs. It is
- also controlled by the kernel.softlockup_panic sysctl
- and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
+ A value of non-zero instructs the soft-lockup detector
+ to panic the machine when a soft-lockup duration exceeds
+ N thresholds. It is also controlled by the kernel.softlockup_panic
+ sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
respective build-time switch to that functionality.
softlockup_all_cpu_backtrace=
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 2e6ea13..ec558e5 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -306,7 +306,7 @@ CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_PANIC_TIMEOUT=-1
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_WQ_WATCHDOG=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig
index 07d422f..fb272e3 100644
--- a/arch/arm/configs/pxa3xx_defconfig
+++ b/arch/arm/configs/pxa3xx_defconfig
@@ -100,7 +100,7 @@ CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_KERNEL=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_SHIRQ=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
# CONFIG_SCHED_DEBUG is not set
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_SPINLOCK_SLEEP=y
diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig
index fb1eb9a..984b0e3 100644
--- a/arch/openrisc/configs/or1klitex_defconfig
+++ b/arch/openrisc/configs/or1klitex_defconfig
@@ -52,5 +52,5 @@ CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,bpf"
CONFIG_PRINTK_TIME=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BUG_ON_DATA_CORRUPTION=y
diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
index 2b71a6d..a4114fc 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -289,7 +289,7 @@ CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
CONFIG_WQ_WATCHDOG=y
diff --git a/drivers/gpu/drm/ci/arm.config b/drivers/gpu/drm/ci/arm.config
index 411e814..d7c5167 100644
--- a/drivers/gpu/drm/ci/arm.config
+++ b/drivers/gpu/drm/ci/arm.config
@@ -52,7 +52,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=n
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=n
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0
CONFIG_FW_LOADER_COMPRESS=y
diff --git a/drivers/gpu/drm/ci/arm64.config b/drivers/gpu/drm/ci/arm64.config
index fddfbd4..ea0e307 100644
--- a/drivers/gpu/drm/ci/arm64.config
+++ b/drivers/gpu/drm/ci/arm64.config
@@ -161,7 +161,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_DETECT_HUNG_TASK=y
diff --git a/drivers/gpu/drm/ci/x86_64.config b/drivers/gpu/drm/ci/x86_64.config
index 8eaba388..7ac98a7 100644
--- a/drivers/gpu/drm/ci/x86_64.config
+++ b/drivers/gpu/drm/ci/x86_64.config
@@ -47,7 +47,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_DETECT_HUNG_TASK=y
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 0685e3a..a5fa116 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly;
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
- IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC;
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
@@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT);
- if (softlockup_panic)
+ duration = duration / get_softlockup_thresh();
+
+ if (softlockup_panic && duration >= softlockup_panic)
panic("softlockup: hung tasks");
}
@@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "softlockup_sys_info",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ba36939..17a7a77 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1110,13 +1110,14 @@ config SOFTLOCKUP_DETECTOR_INTR_STORM
the CPU stats and the interrupt counts during the "soft lockups".
config BOOTPARAM_SOFTLOCKUP_PANIC
- bool "Panic (Reboot) On Soft Lockups"
+ int "Panic (Reboot) On Soft Lockups"
depends on SOFTLOCKUP_DETECTOR
+ default 0
help
- Say Y here to enable the kernel to panic on "soft lockups",
- which are bugs that cause the kernel to loop in kernel
- mode for more than 20 seconds (configurable using the watchdog_thresh
- sysctl), without giving other tasks a chance to run.
+ Set to a non-zero value N to enable the kernel to panic on "soft
+ lockups", which are bugs that cause the kernel to loop in kernel
+ mode for more than (N * 20 seconds) (configurable using the
+ watchdog_thresh sysctl), without giving other tasks a chance to run.
The panic can be used in combination with panic_timeout,
to cause the system to reboot automatically after a
@@ -1124,7 +1125,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC
high-availability systems that have uptime guarantees and
where a lockup must be resolved ASAP.
- Say N if unsure.
+ Say 0 if unsure.
config HAVE_HARDLOCKUP_DETECTOR_BUDDY
bool
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 558839e..2485538 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -1,6 +1,6 @@
CONFIG_BLK_DEV_LOOP=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BPF=y
CONFIG_BPF_EVENTS=y
CONFIG_BPF_JIT=y
diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config
index 0504c11..bb89d2d 100644
--- a/tools/testing/selftests/wireguard/qemu/kernel.config
+++ b/tools/testing/selftests/wireguard/qemu/kernel.config
@@ -80,7 +80,7 @@ CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_WQ_WATCHDOG=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_PANIC_TIMEOUT=-1
CONFIG_STACKTRACE=y
--
2.9.4
The resctrl selftest currently exhibits several failures on Hygon CPUs
due to missing vendor detection and edge-case handling specific to
Hygon's architecture.
This patch series addresses three distinct issues:
1. A division-by-zero crash in SNC detection on some platforms (e.g.,
Hygon).
2. Missing CPU vendor detection, causing the test to fail with
"# Can not get vendor info..." on Hygon CPUs.
3. Incorrect handling of non-contiguous CBM support on Hygon CPUs.
These changes enable resctrl selftest to run successfully on
Hygon CPUs that support Platform QoS features.
Maintainer notes:
-----------------
Patch 1: selftests/resctrl: Fix a division by zero error on Hygon
- This is a candidate for backport with "Fixes:" tag.
Patch 2: selftests/resctrl: Define CPU vendor IDs as bits to match usage
- This is *not* a candidate for backport since it is an enhancement and
preparatory patch for patch 3.
Patch 3: selftests/resctrl: Add CPU vendor detection for Hygon
Patch 4: selftests/resctrl: Fix non-contiguous CBM check for Hygon
- Even though they are fixes they are *not* candidates for backport
since they are based on another patch series (x86/resctrl: Fix
Platform QoS issues for Hygon) which is in process of being added to
resctrl.
-----------------
Changelog:
v5:
- Patch 2:
1. Fix a nit of "reverse fir ordering" of the variable declarations in
detect_vendor() in v4 patch series (Reinette).
2. Add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>.
v4:
- Cover letter: add maintainer notes outlining how these patches to be
handled (Reinette).
- Re-organize the patch series to move original patch 3 to the beginning
of series. The patch order has changed between v3 and v4 (Reinette):
v3 -> v4
patch #3 -> patch #1
patch #1 -> patch #2
patch #2 -> patch #3
patch #4 -> patch #4
- Patch 2:
1. Resolve a conflict against latest upstream kernel (Reinette).
2. Fix a nit to maintain the reverse fir ordering of variables in
detect_vendor() (Reinette).
- Patch 3: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
- Patch 4: move the maintainer note into the cover letter (Reinette).
v3:
- Patch 1:
1. Update the return types of detect_vendor() and get_vendor() from
'int' to 'unsigned int' to align with their usage as bitmask values
and to prevent potentially risky type conversions (Fenghua).
2. Split the code changes of "define CPU vendor IDs as bits to match
usage" from original patch 1 into a separate patch (this patch,
suggested by Fenghua and Reinette).
3. Introduce the flag 'initialized' to simplify the get_vendor() ->
detect_vendor() logic (Reinette).
- Patch 2 (original patch 1):
1. Move the code changes of "define CPU vendor IDs as bits to match
usage" into patch 1.
- Patch 3 (original patch 2):
1. Fix a nit of code comment for affected platforms (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
- Patch 4 (original patch 3):
1. Fix a nit to avoid calling get_vendor() twice (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
v2:
- Patch 1: switch all of the vendor id bitmasks to use BIT() (Reinette)
- Patch 2: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
- Patch 3: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
add a maintainer note to highlight it is not a candidate for
backport (Reinette)
Xiaochen Shen (4):
selftests/resctrl: Fix a division by zero error on Hygon
selftests/resctrl: Define CPU vendor IDs as bits to match usage
selftests/resctrl: Add CPU vendor detection for Hygon
selftests/resctrl: Fix non-contiguous CBM check for Hygon
tools/testing/selftests/resctrl/cat_test.c | 6 ++--
tools/testing/selftests/resctrl/resctrl.h | 8 ++++--
.../testing/selftests/resctrl/resctrl_tests.c | 28 +++++++++++++------
tools/testing/selftests/resctrl/resctrlfs.c | 10 +++++++
4 files changed, 39 insertions(+), 13 deletions(-)
--
2.47.3
The resctrl selftest currently exhibits several failures on Hygon CPUs
due to missing vendor detection and edge-case handling specific to
Hygon's architecture.
This patch series addresses three distinct issues:
1. A division-by-zero crash in SNC detection on some platforms (e.g.,
Hygon).
2. Missing CPU vendor detection, causing the test to fail with
"# Can not get vendor info..." on Hygon CPUs.
3. Incorrect handling of non-contiguous CBM support on Hygon CPUs.
These changes enable resctrl selftest to run successfully on
Hygon CPUs that support Platform QoS features.
Maintainer notes:
-----------------
Patch 1: selftests/resctrl: Fix a division by zero error on Hygon
- This is a candidate for backport with "Fixes:" tag.
Patch 2: selftests/resctrl: Define CPU vendor IDs as bits to match usage
- This is *not* a candidate for backport since it is an enhancement and
preparatory patch for patch 3.
Patch 3: selftests/resctrl: Add CPU vendor detection for Hygon
Patch 4: selftests/resctrl: Fix non-contiguous CBM check for Hygon
- Even though they are fixes they are *not* candidates for backport
since they are based on another patch series (x86/resctrl: Fix
Platform QoS issues for Hygon) which is in process of being added to
resctrl.
-----------------
Changelog:
v4:
- Cover letter: add maintainer notes outlining how these patches to be
handled (Reinette).
- Re-organize the patch series to move original patch 3 to the beginning
of series. The patch order has changed between v3 and v4 (Reinette):
v3 -> v4
patch #3 -> patch #1
patch #1 -> patch #2
patch #2 -> patch #3
patch #4 -> patch #4
- Patch 2:
1. Resolve a conflict against latest upstream kernel (Reinette).
2. Fix a nit to maintain the reverse fir ordering of variables in
detect_vendor() (Reinette).
- Patch 3: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
- Patch 4: move the maintainer note into the cover letter (Reinette).
v3:
- Patch 1:
1. Update the return types of detect_vendor() and get_vendor() from
'int' to 'unsigned int' to align with their usage as bitmask values
and to prevent potentially risky type conversions (Fenghua).
2. Split the code changes of "define CPU vendor IDs as bits to match
usage" from original patch 1 into a separate patch (this patch,
suggested by Fenghua and Reinette).
3. Introduce the flag 'initialized' to simplify the get_vendor() ->
detect_vendor() logic (Reinette).
- Patch 2 (original patch 1):
1. Move the code changes of "define CPU vendor IDs as bits to match
usage" into patch 1.
- Patch 3 (original patch 2):
1. Fix a nit of code comment for affected platforms (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
- Patch 4 (original patch 3):
1. Fix a nit to avoid calling get_vendor() twice (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
v2:
- Patch 1: switch all of the vendor id bitmasks to use BIT() (Reinette)
- Patch 2: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
- Patch 3: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
add a maintainer note to highlight it is not a candidate for
backport (Reinette)
Xiaochen Shen (4):
selftests/resctrl: Fix a division by zero error on Hygon
selftests/resctrl: Define CPU vendor IDs as bits to match usage
selftests/resctrl: Add CPU vendor detection for Hygon
selftests/resctrl: Fix non-contiguous CBM check for Hygon
tools/testing/selftests/resctrl/cat_test.c | 6 ++--
tools/testing/selftests/resctrl/resctrl.h | 8 ++++--
.../testing/selftests/resctrl/resctrl_tests.c | 28 +++++++++++++------
tools/testing/selftests/resctrl/resctrlfs.c | 10 +++++++
4 files changed, 39 insertions(+), 13 deletions(-)
--
2.47.3
This series extends BPF's cryptographic capabilities by adding kfuncs for
SHA hashing and ECDSA signature verification. These functions enable BPF
programs to perform cryptographic operations for use cases such as content
verification, integrity checking, and data authentication.
BPF programs increasingly need to verify data integrity and authenticity in
networking, security, and observability contexts. While BPF already supports
symmetric encryption/decryption, it lacks support for:
1. Cryptographic hashing - needed for content verification, fingerprinting,
and preparing message digests for signature operations
2. Asymmetric signature verification - needed to verify signed data without
requiring the signing key in the datapath
These capabilities enable use cases such as:
- Verifying signed network packets or application data in XDP/TC programs
- Implementing integrity checks in tracing and security monitoring
- Building zero-trust security models where BPF programs verify credentials
- Content-addressed storage and deduplication in BPF-based filesystems
Implementation:
The implementation follows BPF's existing crypto patterns:
1. Uses bpf_dynptr for safe memory access without page fault risks
2. Leverages the kernel's existing crypto library (lib/crypto/sha256.c and
crypto/ecdsa.c) rather than reimplementing algorithms
3. Provides context-based API for ECDSA to enable key reuse and support
multiple program types (syscall, XDP, TC)
4. Includes comprehensive selftests with NIST test vectors
Patch 1: bpf: Extend bpf_crypto_type with hash operations
- Adds hash operation callbacks to bpf_crypto_type structure
- Adds hash() and digestsize() function pointers
- Must come before crypto module to maintain bisectability
Patch 2: crypto: Add BPF hash algorithm type registration module
- Adds bpf_crypto_shash module in crypto/ subsystem
- Registers hash type with BPF crypto infrastructure
- Enables hash algorithm access through unified bpf_crypto_type interface
- Implements callbacks: alloc_tfm, free_tfm, hash, digestsize, get_flags
- Manages shash_desc lifecycle internally
Patch 3: bpf: Add SHA hash kfunc for cryptographic hashing
- Adds bpf_crypto_hash() kfunc for SHA-256/384/512
- Updates bpf_crypto_ctx_create() to support keyless operations
- Protected by CONFIG_CRYPTO_HASH2 guards
- Uses kernel's crypto library implementations
- Fixed u64 types for dynptr sizes to prevent truncation
Patch 4: selftests/bpf: Add tests for bpf_crypto_hash kfunc
- Tests basic functionality with NIST "abc" test vectors
- Validates error handling for invalid parameters (zero-length input)
- Ensures correct hash output for SHA-256, SHA-384, and SHA-512
- Adds CONFIG_CRYPTO_HASH2 and CONFIG_CRYPTO_SHA512 to selftest config
- Refactored test setup code to reduce duplication
Patch 5: bpf: Add ECDSA signature verification kfuncs
- Context-based API: bpf_ecdsa_ctx_create/acquire/release pattern
- Supports NIST curves (P-256, P-384, P-521)
- Adds bpf_ecdsa_verify() for signature verification
- Includes size query functions: keysize, digestsize, maxsize
- Enables use in non-sleepable contexts via pre-allocated contexts
- Uses crypto_sig API with p1363 format (r || s signatures)
Patch 6: selftests/bpf: Add tests for ECDSA signature verification
- Tests valid signature acceptance with RFC 6979 test vectors for P-256
- Tests invalid signature rejection
- Tests size query functions (keysize, digestsize, maxsize)
- Uses well-known NIST test vectors with "sample" message
- Adds CONFIG_CRYPTO_ECDSA to selftest config
v2:
- Fixed redundant __bpf_dynptr_is_rdonly() checks (Vadim)
- Added BPF hash algorithm type registration module in crypto/ subsystem
- Added CONFIG_CRYPTO_HASH2 guards around bpf_crypto_hash() kfunc and its
BTF registration, matching the pattern used for CONFIG_CRYPTO_ECDSA
- Added mandatory digestsize validation for hash operations
v3:
- Fixed patch ordering - header changes now in separate first commit before
crypto module to ensure bisectability (bot+bpf-ci)
- Fixed type mismatch - changed u32 to u64 for dynptr sizes in
bpf_crypto_hash() to match __bpf_dynptr_size() return type (Mykyta)
- Added CONFIG_CRYPTO_ECDSA to selftest config (Song)
- Refactored test code duplication with setup_skel() helper (Song)
- Added copyright notices to all new files
Daniel Hodges (6):
bpf: Extend bpf_crypto_type with hash operations
crypto: Add BPF hash algorithm type registration module
bpf: Add SHA hash kfunc for cryptographic hashing
selftests/bpf: Add tests for bpf_crypto_hash kfunc
bpf: Add ECDSA signature verification kfuncs
selftests/bpf: Add tests for ECDSA signature verification kfuncs
crypto/Makefile | 3 +
crypto/bpf_crypto_shash.c | 95 ++++++
include/linux/bpf_crypto.h | 2 +
kernel/bpf/crypto.c | 306 +++++++++++++++++-
tools/testing/selftests/bpf/config | 3 +
.../selftests/bpf/prog_tests/crypto_hash.c | 147 +++++++++
.../selftests/bpf/prog_tests/ecdsa_verify.c | 75 +++++
.../testing/selftests/bpf/progs/crypto_hash.c | 142 ++++++++
.../selftests/bpf/progs/ecdsa_verify.c | 160 +++++++++
9 files changed, 925 insertions(+), 8 deletions(-)
create mode 100644 crypto/bpf_crypto_shash.c
create mode 100644 tools/testing/selftests/bpf/prog_tests/crypto_hash.c
create mode 100644 tools/testing/selftests/bpf/prog_tests/ecdsa_verify.c
create mode 100644 tools/testing/selftests/bpf/progs/crypto_hash.c
create mode 100644 tools/testing/selftests/bpf/progs/ecdsa_verify.c
--
2.51.0
When the selftest 'tap.c' is compiled with '-D_FORTIFY_SOURCE=3',
the strcpy() in rtattr_add_strsz() is replaced with a checked
version which causes the test to consistently fail when compiled
with toolchains for which this option is enabled by default.
TAP version 13
1..3
# Starting 3 tests from 1 test cases.
# RUN tap.test_packet_valid_udp_gso ...
*** buffer overflow detected ***: terminated
# test_packet_valid_udp_gso: Test terminated by assertion
# FAIL tap.test_packet_valid_udp_gso
not ok 1 tap.test_packet_valid_udp_gso
# RUN tap.test_packet_valid_udp_csum ...
*** buffer overflow detected ***: terminated
# test_packet_valid_udp_csum: Test terminated by assertion
# FAIL tap.test_packet_valid_udp_csum
not ok 2 tap.test_packet_valid_udp_csum
# RUN tap.test_packet_crash_tap_invalid_eth_proto ...
*** buffer overflow detected ***: terminated
# test_packet_crash_tap_invalid_eth_proto: Test terminated by assertion
# FAIL tap.test_packet_crash_tap_invalid_eth_proto
not ok 3 tap.test_packet_crash_tap_invalid_eth_proto
# FAILED: 0 / 3 tests passed.
# Totals: pass:0 fail:3 xfail:0 xpass:0 skip:0 error:0
A buffer overflow is detected by the fortified glibc __strcpy_chk()
since the __builtin_object_size() of `RTA_DATA(rta)` is incorrectly
reported as 1, even though there is ample space in its bounding
buffer `req`.
Additionally, given that IFLA_IFNAME also expects a null-terminated
string, callers of rtaddr_add_str{,sz}() could simply use the
rtaddr_add_strsz() variant. (which has been renamed to remove the
trailing `sz`) memset() has been used for this function since it
is unchecked and thus circumvents the issue discussed in the
previous paragraph.
Fixes: 2e64fe4624d1 ("selftests: add few test cases for tap driver")
Signed-off-by: Alice C. Munduruca <alice.munduruca(a)canonical.com>
Reviewed-by: Cengiz Can <cengiz.can(a)canonical.com>
---
tools/testing/selftests/net/tap.c | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/tools/testing/selftests/net/tap.c b/tools/testing/selftests/net/tap.c
index 247c3b3ac1c9..51a209014f1c 100644
--- a/tools/testing/selftests/net/tap.c
+++ b/tools/testing/selftests/net/tap.c
@@ -56,18 +56,12 @@ static void rtattr_end(struct nlmsghdr *nh, struct rtattr *attr)
static struct rtattr *rtattr_add_str(struct nlmsghdr *nh, unsigned short type,
const char *s)
{
- struct rtattr *rta = rtattr_add(nh, type, strlen(s));
+ unsigned int strsz = strlen(s) + 1;
+ struct rtattr *rta;
- memcpy(RTA_DATA(rta), s, strlen(s));
- return rta;
-}
-
-static struct rtattr *rtattr_add_strsz(struct nlmsghdr *nh, unsigned short type,
- const char *s)
-{
- struct rtattr *rta = rtattr_add(nh, type, strlen(s) + 1);
+ rta = rtattr_add(nh, type, strsz);
- strcpy(RTA_DATA(rta), s);
+ memcpy(RTA_DATA(rta), s, strsz);
return rta;
}
@@ -119,7 +113,7 @@ static int dev_create(const char *dev, const char *link_type,
link_info = rtattr_begin(&req.nh, IFLA_LINKINFO);
- rtattr_add_strsz(&req.nh, IFLA_INFO_KIND, link_type);
+ rtattr_add_str(&req.nh, IFLA_INFO_KIND, link_type);
if (fill_info_data) {
info_data = rtattr_begin(&req.nh, IFLA_INFO_DATA);
--
2.48.1
The templated test names in psp.py had a bug that was not exposed
until 80970e0fc07e ("selftests: net: py: extract the case generation
logic") changed the order of test case evaluation and test case name
extraction.
The test cases created in psp_ip_ver_test_builder() and
ipver_test_builder() were only assigning formatted names to the test
cases they returned, when the test itself was run. This series moves
the test case naming to the point where the test function is created.
Using netdevsim psp:
Before:
./tools/testing/selftests/drivers/net/psp.py
TAP version 13
1..28
ok 1 psp.test_case
ok 2 psp.test_case
ok 3 psp.test_case
ok 4 psp.test_case
ok 5 psp.test_case
ok 6 psp.test_case
ok 7 psp.test_case
ok 8 psp.test_case
ok 9 psp.test_case
ok 10 psp.test_case
ok 11 psp.dev_list_devices
...
ok 28 psp.removal_device_bi
# Totals: pass:28 fail:0 xfail:0 xpass:0 skip:0 error:0
#
# Responder logs (0):
# STDERR:
# Set PSP enable on device 3 to 0xf
# Set PSP enable on device 3 to 0x0
After:
./tools/testing/selftests/drivers/net/psp.py
TAP version 13
1..28
ok 1 psp.data_basic_send_v0_ip4
ok 2 psp.data_basic_send_v0_ip6
ok 3 psp.data_basic_send_v1_ip4
ok 4 psp.data_basic_send_v1_ip6
ok 5 psp.data_basic_send_v2_ip4
ok 6 psp.data_basic_send_v2_ip6
ok 7 psp.data_basic_send_v3_ip4
ok 8 psp.data_basic_send_v3_ip6
ok 9 psp.data_mss_adjust_ip4
ok 10 psp.data_mss_adjust_ip6
ok 11 psp.dev_list_devices
...
ok 28 psp.removal_device_bi
# Totals: pass:28 fail:0 xfail:0 xpass:0 skip:0 error:0
#
# Responder logs (0):
# STDERR:
# Set PSP enable on device 3 to 0xf
# Set PSP enable on device 3 to 0x0
Signed-off-by: Daniel Zahka <daniel.zahka(a)gmail.com>
---
Daniel Zahka (2):
selftests: drv-net: psp: fix templated test names in psp_ip_ver_test_builder()
selftests: drv-net: psp: fix test names in ipver_test_builder()
tools/testing/selftests/drivers/net/psp.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
---
base-commit: 885bebac9909994050bbbeed0829c727e42bd1b7
change-id: 20251212-psp-test-fix-f0816c40a2c1
Best regards,
--
Daniel Zahka <daniel.zahka(a)gmail.com>
Make use of empty (NULL-terminated) array instead of NULL pointer to
avoid compiler errors while maintaining the behavior of the function
intact
Signed-off-by: Clint George <clintbgeorge(a)gmail.com>
---
[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture
Let me know if any more testing is needed to be done
[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/filesystems$ make LLVM=1 W=1
CC devpts_pts
CC file_stressor
CC anon_inode_test
anon_inode_test.c:45:37: warning: null passed to a callee that requires a non-null argument [-Wnonnull]
45 | ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
| ^~~~
/usr/lib/llvm-18/lib/clang/18/include/__stddef_null.h:26:14: note: expanded from macro 'NULL'
26 | #define NULL ((void*)0)
| ^~~~~~~~~~
/home/clint/Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:535:11: note: expanded from macro 'ASSERT_LT'
535 | __EXPECT(expected, #expected, seen, #seen, <, 1)
| ^~~~~~~~
/home/clint/Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:758:33: note: expanded from macro '__EXPECT'
758 | __typeof__(_expected) __exp = (_expected); \
| ^~~~~~~~~
1 warning generated.
tools/testing/selftests/filesystems/anon_inode_test.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c
index 94c6c81c2..2c4c50500 100644
--- a/tools/testing/selftests/filesystems/anon_inode_test.c
+++ b/tools/testing/selftests/filesystems/anon_inode_test.c
@@ -42,7 +42,10 @@ TEST(anon_inode_no_exec)
fd_context = sys_fsopen("tmpfs", 0);
ASSERT_GE(fd_context, 0);
- ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
+ char *const empty_argv[] = {NULL};
+ char *const empty_envp[] = {NULL};
+
+ ASSERT_LT(execveat(fd_context, "", empty_argv, empty_envp, AT_EMPTY_PATH), 0);
ASSERT_EQ(errno, EACCES);
EXPECT_EQ(close(fd_context), 0);
--
2.43.0
Use __builtin_trap() to truly crash the program instead of dereferencing
null pointer which may be optimized by the compiler preventing the crash
from occurring
Signed-off-by: Clint George <clintbgeorge(a)gmail.com>
---
[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture
Let me know if any more testing is needed to be done
[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/coredump$ make LLVM=1 W=1
CC stackdump_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
CC coredump_socket_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
CC coredump_socket_protocol_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
tools/testing/selftests/coredump/coredump_test_helpers.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c
index a6f6d5f2a..5c8adee63 100644
--- a/tools/testing/selftests/coredump/coredump_test_helpers.c
+++ b/tools/testing/selftests/coredump/coredump_test_helpers.c
@@ -56,7 +56,7 @@ void crashing_child(void)
pthread_create(&thread, NULL, do_nothing, NULL);
/* crash on purpose */
- i = *(int *)NULL;
+ __builtin_trap();
}
int create_detached_tmpfs(void)
--
2.43.0
Add descriptive message in the _Static_assert to comply with the C11
standard requirement to prevent compiler from throwing out error. The
compiler throws an error when _Static_assert is used without a message as
that is a C23 extension.
Signed-off-by: Clint George <clintbgeorge(a)gmail.com>
---
[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture
Let me know if any more testing is needed to be done
[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/ublk$ make LLVM=1 W=1
CC kublk
In file included from kublk.c:6:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
In file included from null.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
In file included from file_backed.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
In file included from common.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
In file included from stripe.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
In file included from fault_inject.c:11:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
make: *** [../lib.mk:225: /home/clint/Desktop/kernel-dev/linux-v1/tools/testing/selftests/ublk/kublk] Error 1
tools/testing/selftests/ublk/kublk.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index fe42705c6..e5eb5f762 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -217,7 +217,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op,
unsigned tgt_data, unsigned q_id, unsigned is_target_io)
{
/* we only have 7 bits to encode q_id */
- _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
+ _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7, "UBLK_MAX_QUEUES_SHIFT must be <= 7");
assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
return tag | (op << 16) | (tgt_data << 24) |
--
2.43.0
On the Android arm32 platform, when performing the futex_requeue test, it will
most likely return a failure. The specific reason is detailed in a commit[1]
previously submitted by Edward Liaw. However, this commit cannot perfectly
solve the problem. This is because using a barrier does not guarantee that
the child thread will wait on futex_wait.
This series of patches attempts to solve this problem by checking whether
the child thread is in a sleeping state. This is because when the child thread
goes to sleep, it indicates that it is waiting for the futex lock.
v1->v2:
- Solve the compilation problems found by the kernel test robot
- Cleanup the atomic library code for futex test
Link: https://lore.kernel.org/all/20240918231102.234253-1-edliaw@google.com/
In the thread_state_get() function, the logic to find the thread's state
character was using `sizeof(header) - 1` to calculate the offset from
the "State:\t" string.
The `header` variable is a `const char *` pointer. `sizeof()` on a
pointer returns the size of the pointer itself, not the length of the
string literal it points to. This makes the code's behavior dependent
on the architecture's pointer size.
This bug was identified on a 32-bit ARM build (`gsi_tv_arm`) for
Android, running on an ARMv8-based device, compiled with Clang 19.0.1.
On this 32-bit architecture, `sizeof(char *)` is 4. The expression
`sizeof(header) - 1` resulted in an incorrect offset of 3, causing the
test to read the wrong character from `/proc/[tid]/status` and fail.
On 64-bit architectures, `sizeof(char *)` is 8, so the expression
coincidentally evaluates to 7, which matches the length of "State:\t".
This is why the bug likely remained hidden on 64-bit builds.
To fix this and make the code portable and correct across all
architectures, this patch replaces `sizeof(header) - 1` with
`strlen(header)`. The `strlen()` function correctly calculates the
string's length, ensuring the correct offset is always used.
Signed-off-by: Wake Liu <wakel(a)google.com>
---
tools/testing/selftests/mm/uffd-unit-tests.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
index f4807242c5b2..6f5e404a446c 100644
--- a/tools/testing/selftests/mm/uffd-unit-tests.c
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -1317,7 +1317,7 @@ static thread_state thread_state_get(pid_t tid)
p = strstr(tmp, header);
if (p) {
/* For example, "State:\tD (disk sleep)" */
- c = *(p + sizeof(header) - 1);
+ c = *(p + strlen(header));
return c == 'D' ?
THR_STATE_UNINTERRUPTIBLE : THR_STATE_UNKNOWN;
}
--
2.52.0.223.gf5cc29aaa4-goog
This patch series adds __rust_helper to every single rust helper. The
patches do not depend on each other, so maintainers please go ahead and
pick up any patches relevant to your subsystem! Or provide your Acked-by
so that Miguel can pick them up.
These changes were generated by adding __rust_helper and running
ClangFormat. Unrelated formatting changes were removed manually.
Why is __rust_helper needed?
============================
Currently, C helpers cannot be inlined into Rust even when using LTO
because LLVM detects slightly different options on the codegen units.
* LLVM doesn't want to inline functions compiled with
`-fno-delete-null-pointer-checks` with code compiled without. The C
CGUs all have this enabled and Rust CGUs don't. Inlining is okay since
this is one of the hardening features that does not change the ABI,
and we shouldn't have null pointer dereferences in these helpers.
* LLVM doesn't want to inline functions with different list of builtins. C
side has `-fno-builtin-wcslen`; `wcslen` is not a Rust builtin, so
they should be compatible, but LLVM does not perform inlining due to
attributes mismatch.
* clang and Rust doesn't have the exact target string. Clang generates
`+cmov,+cx8,+fxsr` but Rust doesn't enable them (in fact, Rust will
complain if `-Ctarget-feature=+cmov,+cx8,+fxsr` is used). x86-64
always enable these features, so they are in fact the same target
string, but LLVM doesn't understand this and so inlining is inhibited.
This can be bypassed with `--ignore-tti-inline-compatible`, but this
is a hidden option.
(This analysis was written by Gary Guo.)
How is this fixed?
==================
To fix this we need to add __always_inline to all helpers when compiling
with LTO. However, it should not be added when running bindgen as
bindgen will ignore functions marked inline. To achieve this, we are
using a #define called __rust_helper that is defined differently
depending on whether bindgen is running or not.
Note that __rust_helper is currently always #defined to nothing.
Changing it to __always_inline will happen separately in another patch
series.
Signed-off-by: Alice Ryhl <aliceryhl(a)google.com>
---
Alice Ryhl (46):
rust: auxiliary: add __rust_helper to helpers
rust: barrier: add __rust_helper to helpers
rust: binder: add __rust_helper to helpers
rust: bitmap: add __rust_helper to helpers
rust: bitops: add __rust_helper to helpers
rust: blk: add __rust_helper to helpers
rust: bug: add __rust_helper to helpers
rust: clk: add __rust_helper to helpers
rust: completion: add __rust_helper to helpers
rust: cpu: add __rust_helper to helpers
rust: cpufreq: add __rust_helper to helpers
rust: cpumask: add __rust_helper to helpers
rust: cred: add __rust_helper to helpers
rust: device: add __rust_helper to helpers
rust: dma: add __rust_helper to helpers
rust: drm: add __rust_helper to helpers
rust: err: add __rust_helper to helpers
rust: fs: add __rust_helper to helpers
rust: io: add __rust_helper to helpers
rust: irq: add __rust_helper to helpers
rust: jump_label: add __rust_helper to helpers
rust: kunit: add __rust_helper to helpers
rust: maple_tree: add __rust_helper to helpers
rust: mm: add __rust_helper to helpers
rust: of: add __rust_helper to helpers
rust: pci: add __rust_helper to helpers
rust: pid_namespace: add __rust_helper to helpers
rust: platform: add __rust_helper to helpers
rust: poll: add __rust_helper to helpers
rust: processor: add __rust_helper to helpers
rust: property: add __rust_helper to helpers
rust: rbtree: add __rust_helper to helpers
rust: rcu: add __rust_helper to helpers
rust: refcount: add __rust_helper to helpers
rust: regulator: add __rust_helper to helpers
rust: scatterlist: add __rust_helper to helpers
rust: security: add __rust_helper to helpers
rust: slab: add __rust_helper to helpers
rust: sync: add __rust_helper to helpers
rust: task: add __rust_helper to helpers
rust: time: add __rust_helper to helpers
rust: uaccess: add __rust_helper to helpers
rust: usb: add __rust_helper to helpers
rust: wait: add __rust_helper to helpers
rust: workqueue: add __rust_helper to helpers
rust: xarray: add __rust_helper to helpers
rust/helpers/auxiliary.c | 6 +++--
rust/helpers/barrier.c | 6 ++---
rust/helpers/binder.c | 13 ++++-----
rust/helpers/bitmap.c | 6 +++--
rust/helpers/bitops.c | 11 +++++---
rust/helpers/blk.c | 4 +--
rust/helpers/bug.c | 4 +--
rust/helpers/build_bug.c | 2 +-
rust/helpers/clk.c | 24 +++++++++--------
rust/helpers/completion.c | 2 +-
rust/helpers/cpu.c | 2 +-
rust/helpers/cpufreq.c | 3 ++-
rust/helpers/cpumask.c | 32 +++++++++++++---------
rust/helpers/cred.c | 4 +--
rust/helpers/device.c | 16 +++++------
rust/helpers/dma.c | 15 ++++++-----
rust/helpers/drm.c | 7 ++---
rust/helpers/err.c | 6 ++---
rust/helpers/fs.c | 2 +-
rust/helpers/io.c | 64 +++++++++++++++++++++++---------------------
rust/helpers/irq.c | 6 +++--
rust/helpers/jump_label.c | 2 +-
rust/helpers/kunit.c | 2 +-
rust/helpers/maple_tree.c | 3 ++-
rust/helpers/mm.c | 20 +++++++-------
rust/helpers/mutex.c | 13 ++++-----
rust/helpers/of.c | 2 +-
rust/helpers/page.c | 9 ++++---
rust/helpers/pci.c | 13 +++++----
rust/helpers/pid_namespace.c | 8 +++---
rust/helpers/platform.c | 2 +-
rust/helpers/poll.c | 5 ++--
rust/helpers/processor.c | 2 +-
rust/helpers/property.c | 2 +-
rust/helpers/rbtree.c | 5 ++--
rust/helpers/rcu.c | 4 +--
rust/helpers/refcount.c | 10 +++----
rust/helpers/regulator.c | 24 ++++++++++-------
rust/helpers/scatterlist.c | 12 +++++----
rust/helpers/security.c | 26 ++++++++++--------
rust/helpers/signal.c | 2 +-
rust/helpers/slab.c | 14 +++++-----
rust/helpers/spinlock.c | 13 ++++-----
rust/helpers/sync.c | 4 +--
rust/helpers/task.c | 24 ++++++++---------
rust/helpers/time.c | 12 ++++-----
rust/helpers/uaccess.c | 8 +++---
rust/helpers/usb.c | 3 ++-
rust/helpers/vmalloc.c | 7 ++---
rust/helpers/wait.c | 2 +-
rust/helpers/workqueue.c | 8 +++---
rust/helpers/xarray.c | 10 +++----
52 files changed, 280 insertions(+), 226 deletions(-)
---
base-commit: 54e3eae855629702c566bd2e130d9f40e7f35bde
change-id: 20251202-define-rust-helper-f7b531813007
Best regards,
--
Alice Ryhl <aliceryhl(a)google.com>
When the selftest 'tap.c' is compiled with '-D_FORTIFY_SOURCE=3', the
strcpy() in rtattr_add_strsz() is replaced with a checked version which
causes the test to consistently fail when compiled with toolchains for
which this option is enabled by default.
TAP version 13
1..3
# Starting 3 tests from 1 test cases.
# RUN tap.test_packet_valid_udp_gso ...
*** buffer overflow detected ***: terminated
# test_packet_valid_udp_gso: Test terminated by assertion
# FAIL tap.test_packet_valid_udp_gso
not ok 1 tap.test_packet_valid_udp_gso
# RUN tap.test_packet_valid_udp_csum ...
*** buffer overflow detected ***: terminated
# test_packet_valid_udp_csum: Test terminated by assertion
# FAIL tap.test_packet_valid_udp_csum
not ok 2 tap.test_packet_valid_udp_csum
# RUN tap.test_packet_crash_tap_invalid_eth_proto ...
*** buffer overflow detected ***: terminated
# test_packet_crash_tap_invalid_eth_proto: Test terminated by assertion
# FAIL tap.test_packet_crash_tap_invalid_eth_proto
not ok 3 tap.test_packet_crash_tap_invalid_eth_proto
# FAILED: 0 / 3 tests passed.
# Totals: pass:0 fail:3 xfail:0 xpass:0 skip:0 error:0
A buffer overflow is detected by the fortified glibc __strcpy_chk()
since the __builtin_object_size() of `RTA_DATA(rta)` is incorrectly
reported as 1, even though there is ample space in its bounding buffer
`req`.
Using the unchecked function memcpy() here instead allows us to match
the way rtattr_add_str() is written while avoiding the spurious test
failure.
Fixes: 2e64fe4624d1 ("selftests: add few test cases for tap driver")
Signed-off-by: Alice C. Munduruca <alice.munduruca(a)canonical.com>
---
tools/testing/selftests/net/tap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/net/tap.c b/tools/testing/selftests/net/tap.c
index 247c3b3ac1c9..dd961b629295 100644
--- a/tools/testing/selftests/net/tap.c
+++ b/tools/testing/selftests/net/tap.c
@@ -67,7 +67,7 @@ static struct rtattr *rtattr_add_strsz(struct nlmsghdr *nh, unsigned short type,
{
struct rtattr *rta = rtattr_add(nh, type, strlen(s) + 1);
- strcpy(RTA_DATA(rta), s);
+ memcpy(RTA_DATA(rta), s, strlen(s) + 1);
return rta;
}
--
2.48.1
From: Jack Thomson <jackabt(a)amazon.com>
This patch series adds ARM64 support for the KVM_PRE_FAULT_MEMORY
feature, which was previously only available on x86 [1]. This allows us
to reduce the number of stage-2 faults during execution. This is of
benefit in post-copy migration scenarios, particularly in memory
intensive applications, where we are experiencing high latencies due to
the stage-2 faults.
Patch Overview:
- The first patch adds support for the KVM_PRE_FAULT_MEMORY ioctl
on arm64.
- The second patch updates the pre_fault_memory_test to support
arm64.
- The last patch extends the pre_fault_memory_test to cover
different vm memory backings.
=== Changes Since v2 [2] ===
- Update fault info synthesize value. Thanks Suzuki
- Remove change to selftests for unaligned mmap allocations. Thanks
Sean
[1]: https://lore.kernel.org/kvm/20240710174031.312055-1-pbonzini@redhat.com
[2]: https://lore.kernel.org/linux-arm-kernel/20251013151502.6679-1-jackabt.amaz…
Jack Thomson (3):
KVM: arm64: Add pre_fault_memory implementation
KVM: selftests: Enable pre_fault_memory_test for arm64
KVM: selftests: Add option for different backing in pre-fault tests
Documentation/virt/kvm/api.rst | 3 +-
arch/arm64/kvm/Kconfig | 1 +
arch/arm64/kvm/arm.c | 1 +
arch/arm64/kvm/mmu.c | 73 +++++++++++-
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../selftests/kvm/pre_fault_memory_test.c | 110 ++++++++++++++----
6 files changed, 159 insertions(+), 30 deletions(-)
base-commit: 8a4821412cf2c1429fffa07c012dd150f2edf78c
--
2.43.0
Since Armv9.6, FEAT_LSUI supplies the load/store instructions for
previleged level to access to access user memory without clearing
PSTATE.PAN bit.
This patchset support FEAT_LSUI and applies in futex atomic operation
and user_swpX emulation where can replace from ldxr/st{l}xr
pair implmentation with clearing PSTATE.PAN bit to correspondant
load/store unprevileged atomic operation without clearing PSTATE.PAN bit.
This patch based on v6.19-rc1
Patch Sequences
================
Patch #1 adds cpufeature for FEAT_LSUI
Patch #2-#3 expose FEAT_LSUI to guest
Patch #4 adds Kconfig for FEAT_LSUI
Patch #5-#6 support futex atomic-op with FEAT_LSUI
Patch #7-#9 support user_swpX emulation with FEAT_LSUI
Patch History
==============
from v10 to v11:
- rebase to v6.19-rc1
- use cast instruction to emulate deprecated swpb instruction
- https://lore.kernel.org/all/20251103163224.818353-1-yeoreum.yun@arm.com/
from v9 to v10:
- apply FEAT_LSUI to user_swpX emulation.
- add test coverage for LSUI bit in ID_AA64ISAR3_EL1
- rebase to v6.18-rc4
- https://lore.kernel.org/all/20250922102244.2068414-1-yeoreum.yun@arm.com/
from v8 to v9:
- refotoring __lsui_cmpxchg64()
- rebase to v6.17-rc7
- https://lore.kernel.org/all/20250917110838.917281-1-yeoreum.yun@arm.com/
from v7 to v8:
- implements futex_atomic_eor() and futex_atomic_cmpxchg() with casalt
with C helper.
- Drop the small optimisation on ll/sc futex_atomic_set operation.
- modify some commit message.
- https://lore.kernel.org/all/20250816151929.197589-1-yeoreum.yun@arm.com/
from v6 to v7:
- wrap FEAT_LSUI with CONFIG_AS_HAS_LSUI in cpufeature
- remove unnecessary addition of indentation.
- remove unnecessary mte_tco_enable()/disable() on LSUI operation.
- https://lore.kernel.org/all/20250811163635.1562145-1-yeoreum.yun@arm.com/
from v5 to v6:
- rebase to v6.17-rc1
- https://lore.kernel.org/all/20250722121956.1509403-1-yeoreum.yun@arm.com/
from v4 to v5:
- remove futex_ll_sc.h futext_lsui and lsui.h and move them to futex.h
- reorganize the patches.
- https://lore.kernel.org/all/20250721083618.2743569-1-yeoreum.yun@arm.com/
from v3 to v4:
- rebase to v6.16-rc7
- modify some patch's title.
- https://lore.kernel.org/all/20250617183635.1266015-1-yeoreum.yun@arm.com/
from v2 to v3:
- expose FEAT_LUSI to guest
- add help section for LUSI Kconfig
- https://lore.kernel.org/all/20250611151154.46362-1-yeoreum.yun@arm.com/
from v1 to v2:
- remove empty v9.6 menu entry
- locate HAS_LUSI in cpucaps in order
- https://lore.kernel.org/all/20250611104916.10636-1-yeoreum.yun@arm.com/
Yeoreum Yun (9):
arm64: cpufeature: add FEAT_LSUI
KVM: arm64: expose FEAT_LSUI to guest
KVM: arm64: kselftest: set_id_regs: add test for FEAT_LSUI
arm64: Kconfig: Detect toolchain support for LSUI
arm64: futex: refactor futex atomic operation
arm64: futex: support futex with FEAT_LSUI
arm64: separate common LSUI definitions into lsui.h
arm64: armv8_deprecated: convert user_swpX to inline function
arm64: armv8_deprecated: apply FEAT_LSUI for swpX emulation.
arch/arm64/Kconfig | 5 +
arch/arm64/include/asm/futex.h | 291 +++++++++++++++---
arch/arm64/include/asm/lsui.h | 25 ++
arch/arm64/kernel/armv8_deprecated.c | 111 +++++--
arch/arm64/kernel/cpufeature.c | 10 +
arch/arm64/kvm/sys_regs.c | 3 +-
arch/arm64/tools/cpucaps | 1 +
.../testing/selftests/kvm/arm64/set_id_regs.c | 1 +
8 files changed, 381 insertions(+), 66 deletions(-)
create mode 100644 arch/arm64/include/asm/lsui.h
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
--
LEVI:{C3F47F37-75D8-414A-A8BA-3980EC8A46D7}
Make use of empty (NULL-terminated) array instead of NULL pointer to
avoid compiler errors while maintaining the behavior of the function
intact
Signed-off-by: Clint George <clintbgeorge(a)gmail.com>
---
[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture
Let me know if any more testing is needed to be done
[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/filesystems$ make LLVM=1 W=1
CC devpts_pts
CC file_stressor
CC anon_inode_test
anon_inode_test.c:45:37: warning: null passed to a callee that requires a non-null argument [-Wnonnull]
45 | ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
| ^~~~
/usr/lib/llvm-18/lib/clang/18/include/__stddef_null.h:26:14: note: expanded from macro 'NULL'
26 | #define NULL ((void*)0)
| ^~~~~~~~~~
/home/clint/Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:535:11: note: expanded from macro 'ASSERT_LT'
535 | __EXPECT(expected, #expected, seen, #seen, <, 1)
| ^~~~~~~~
/home/clint/Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:758:33: note: expanded from macro '__EXPECT'
758 | __typeof__(_expected) __exp = (_expected); \
| ^~~~~~~~~
1 warning generated.
tools/testing/selftests/filesystems/anon_inode_test.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c
index 94c6c81c2..2c4c50500 100644
--- a/tools/testing/selftests/filesystems/anon_inode_test.c
+++ b/tools/testing/selftests/filesystems/anon_inode_test.c
@@ -42,7 +42,10 @@ TEST(anon_inode_no_exec)
fd_context = sys_fsopen("tmpfs", 0);
ASSERT_GE(fd_context, 0);
- ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
+ char *const empty_argv[] = {NULL};
+ char *const empty_envp[] = {NULL};
+
+ ASSERT_LT(execveat(fd_context, "", empty_argv, empty_envp, AT_EMPTY_PATH), 0);
ASSERT_EQ(errno, EACCES);
EXPECT_EQ(close(fd_context), 0);
--
2.43.0
Use __builtin_trap() to truly crash the program instead of dereferencing
null pointer which may be optimized by the compiler preventing the crash
from occurring
Signed-off-by: Clint George <clintbgeorge(a)gmail.com>
---
[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture
Let me know if any more testing is needed to be done
[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/coredump$ make LLVM=1 W=1
CC stackdump_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
CC coredump_socket_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
CC coredump_socket_protocol_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
tools/testing/selftests/coredump/coredump_test_helpers.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c
index a6f6d5f2a..5c8adee63 100644
--- a/tools/testing/selftests/coredump/coredump_test_helpers.c
+++ b/tools/testing/selftests/coredump/coredump_test_helpers.c
@@ -56,7 +56,7 @@ void crashing_child(void)
pthread_create(&thread, NULL, do_nothing, NULL);
/* crash on purpose */
- i = *(int *)NULL;
+ __builtin_trap();
}
int create_detached_tmpfs(void)
--
2.43.0
On the Android arm32 platform, when performing the futex_requeue test, it will
most likely return a failure. The specific reason is detailed in a commit[1]
previously submitted by Edward Liaw. However, this commit cannot perfectly
solve the problem. This is because using a barrier does not guarantee that
the child thread will wait on futex_wait.
This series of patches attempts to solve this problem by checking whether
the child thread is in a sleeping state. This is because when the child thread
goes to sleep, it indicates that it is waiting for the futex lock.
Link: https://lore.kernel.org/all/20240918231102.234253-1-edliaw@google.com/
On Thu, 11 Dec 2025, Jiaqi Yan wrote:
> CONFIGs seem alright to me. Do you boot kernel with cmdline options like "default_hugepagesz=1G hugepagesz=1G hugepages=64", or dynamically set up
> huge pages via "echo 64 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"?
Neither of these. When I do the test is skipped:
# echo 64 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
# ./arm64/sea_to_user
Random seed: 0x6b8b4567
# Mapped 0x40000 pages: gva=0x80000000 to gpa=0xff80000000
# Before EINJect: data=0xbaadcafe
# EINJ_GVA=0x81234bad, einj_gpa=0xff81234bad, einj_hva=0xffff41234bad,
einj_hpa=0x80241234bad
ok 1 # SKIP EINJ module probably not loaded?sh: line 1:
/sys/kernel/debug/apei/einj/error_type: No such file or directory
Bail out! Failed to write EINJ entry: No such file or directory (2)
# 1 skipped test(s) detected. Consider enabling relevant config options to
improve coverage.
# Planned tests != run tests (0 != 1)
# Totals: pass:0 fail:0 xfail:0 xpass:0 skip:1 error:0
This patch series suggests fixes for several corner cases in the RISC-V
vector ptrace implementation:
- init vector context with proper vlenb, to avoid reading zero vlenb
by an early attached debugger
- follow gdbserver expectations and return ENODATA instead of EINVAL
if vector extension is supported but not yet activated for the
traced process
- validate input vector csr registers in ptrace, to maintain an accurate
view of the tracee's vector context across multiple halt/resume
debug cycles
For detailed description see the appropriate commit messages. A new test
suite validate_v_ptrace is added to the tools/testing/selftests/riscv/vector
to verify some of the vector ptrace functionality and corner cases.
So far tested on the following platforms:
- test in QEMU rv32/rv64
- test on c908 (BananaPi CanMV K230D Zero)
- test on c906 (MangoPi MQ Pro)
Previous versions:
- v4: https://lore.kernel.org/linux-riscv/20251108194207.1257866-1-geomatsi@gmail…
- v3: https://lore.kernel.org/linux-riscv/20251025210655.43099-1-geomatsi@gmail.c…
- v2: https://lore.kernel.org/linux-riscv/20250821173957.563472-1-geomatsi@gmail.…
- v1: https://lore.kernel.org/linux-riscv/20251007115840.2320557-1-geomatsi@gmail…
Changes in v5:
- add support and minimal set of tests for XTheadVector
Changes in v4:
The form 'vsetvli x0, x0, ...' can only be used if VLMAX remains
unchanged, see spec 6.2. This condition was not met by the initial
values in the selftests w.r.t. the initial zeroed context. QEMU accepted
such values, but actual hardware (c908, BananaPi CanMV Zero board) did
not, setting vill. So fix the selftests after testing on hardware:
- replace 'vsetvli x0, x0, ...' by 'vsetvli rd, x0, ...'
- fixed instruction returns VLMAX, so use it in checks as well
- replace fixed vlenb == 16 in the syscall test
Changes in v3:
Address the review comments by Andy Chiu and rework the approach:
- drop forced vector context save entirely
- perform strict validation of vector csr regs in ptrace
Changes in v2:
- add thread_info flag to allow to force vector context save
- force vector context save after vector ptrace to ensure valid vector
context in the next ptrace operations
- force vector context save on the first context switch after vector
context init to get proper vlenb
---
Ilya Mamay (1):
riscv: ptrace: return ENODATA for inactive vector extension
Sergey Matyukevich (8):
riscv: vector: init vector context with proper vlenb
riscv: csr: define vtype register elements
riscv: ptrace: validate input vector csr registers
selftests: riscv: test ptrace vector interface
selftests: riscv: verify initial vector state with ptrace
selftests: riscv: verify syscalls discard vector context
selftests: riscv: verify ptrace rejects invalid vector csr inputs
selftests: riscv: verify ptrace accepts valid vector csr values
arch/riscv/include/asm/csr.h | 17 +
arch/riscv/kernel/ptrace.c | 98 +-
arch/riscv/kernel/vector.c | 12 +-
.../testing/selftests/riscv/vector/.gitignore | 2 +
tools/testing/selftests/riscv/vector/Makefile | 10 +-
.../selftests/riscv/vector/v_helpers.c | 23 +
.../selftests/riscv/vector/v_helpers.h | 2 +
.../riscv/vector/validate_v_ptrace.c | 919 ++++++++++++++++++
8 files changed, 1075 insertions(+), 8 deletions(-)
create mode 100644 tools/testing/selftests/riscv/vector/validate_v_ptrace.c
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
--
2.52.0
Note: it's net/ only bits and doesn't include changes, which shoulf be
merged separately and are posted separately. The full branch for
convenience is at [1], and the patch is here:
https://lore.kernel.org/io-uring/7486ab32e99be1f614b3ef8d0e9bc77015b173f7.1…
Many modern NICs support configurable receive buffer lengths, and zcrx and
memory providers can use buffers larger than 4K/PAGE_SIZE on x86 to improve
performance. When paired with hw-gro larger rx buffer sizes can drastically
reduce the number of buffers traversing the stack and save a lot of processing
time. It also allows to give to users larger contiguous chunks of data. The
idea was first floated around by Saeed during netdev conf 2024 and was
asked about by a few folks.
Single stream benchmarks showed up to ~30% CPU util improvement.
E.g. comparison for 4K vs 32K buffers using a 200Gbit NIC:
packets=23987040 (MB=2745098), rps=199559 (MB/s=22837)
CPU %usr %nice %sys %iowait %irq %soft %idle
0 1.53 0.00 27.78 2.72 1.31 66.45 0.22
packets=24078368 (MB=2755550), rps=200319 (MB/s=22924)
CPU %usr %nice %sys %iowait %irq %soft %idle
0 0.69 0.00 8.26 31.65 1.83 57.00 0.57
This series adds net infrastructure for memory providers configuring
the size and implements it for bnxt. It's an opt-in feature for drivers,
they should advertise support for the parameter in the qops and must check
if the hardware supports the given size. It's limited to memory providers
as it drastically simplifies implementation. It doesn't affect the fast
path zcrx uAPI, and the sizes is defined in zcrx terms, which allows it
to be flexible and adjusted in the future, see Patch 8 for details.
A liburing example can be found at [2]
full branch:
[1] https://github.com/isilence/linux.git zcrx/large-buffers-v7
Liburing example:
[2] https://github.com/isilence/liburing.git zcrx/rx-buf-len
v7: - Add xa_destroy
- Rebase
v6: - Update docs and add a selftest
v5: https://lore.kernel.org/netdev/cover.1760440268.git.asml.silence@gmail.com/
- Remove all unnecessary bits like configuration via netlink, and
multi-stage queue configuration.
v4: https://lore.kernel.org/all/cover.1760364551.git.asml.silence@gmail.com/
- Update fbnic qops
- Propagate max buf len for hns3
- Use configured buf size in __bnxt_alloc_rx_netmem
- Minor stylistic changes
v3: https://lore.kernel.org/all/cover.1755499375.git.asml.silence@gmail.com/
- Rebased, excluded zcrx specific patches
- Set agg_size_fac to 1 on warning
v2: https://lore.kernel.org/all/cover.1754657711.git.asml.silence@gmail.com/
- Add MAX_PAGE_ORDER check on pp init
- Applied comments rewording
- Adjust pp.max_len based on order
- Patch up mlx5 queue callbacks after rebase
- Minor ->queue_mgmt_ops refactoring
- Rebased to account for both fill level and agg_size_fac
- Pass providers buf length in struct pp_memory_provider_params and
apply it in __netdev_queue_confi().
- Use ->supported_ring_params to validate drivers support of set
qcfg parameters.
Jakub Kicinski (1):
eth: bnxt: adjust the fill level of agg queues with larger buffers
Pavel Begunkov (8):
net: page pool: xa init with destroy on pp init
net: page_pool: sanitise allocation order
net: memzero mp params when closing a queue
net: let pp memory provider to specify rx buf len
eth: bnxt: store rx buffer size per queue
eth: bnxt: allow providers to set rx buf size
io_uring/zcrx: document area chunking parameter
selftests: iou-zcrx: test large chunk sizes
Documentation/networking/iou-zcrx.rst | 20 +++
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 118 ++++++++++++++----
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +
drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 6 +-
drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 2 +-
include/net/netdev_queues.h | 9 ++
include/net/page_pool/types.h | 1 +
net/core/netdev_rx_queue.c | 14 ++-
net/core/page_pool.c | 4 +
.../selftests/drivers/net/hw/iou-zcrx.c | 72 +++++++++--
.../selftests/drivers/net/hw/iou-zcrx.py | 37 ++++++
11 files changed, 236 insertions(+), 49 deletions(-)
--
2.52.0
Problem
=======
When host APEI is unable to claim a synchronous external abort (SEA)
during guest abort, today KVM directly injects an asynchronous SError
into the VCPU then resumes it. The injected SError usually results in
unpleasant guest kernel panic.
One of the major situation of guest SEA is when VCPU consumes recoverable
uncorrected memory error (UER), which is not uncommon at all in modern
datacenter servers with large amounts of physical memory. Although SError
and guest panic is sufficient to stop the propagation of corrupted memory,
there is room to recover from an UER in a more graceful manner.
Proposed Solution
=================
The idea is, we can replay the SEA to the faulting VCPU. If the memory
error consumption or the fault that cause SEA is not from guest kernel,
the blast radius can be limited to the poison-consuming guest process,
while the VM can keep running.
In addition, instead of doing under the hood without involving userspace,
there are benefits to redirect the SEA to VMM:
- VM customers care about the disruptions caused by memory errors, and
VMM usually has the responsibility to start the process of notifying
the customers of memory error events in their VMs. For example some
cloud provider emits a critical log in their observability UI [1], and
provides a playbook for customers on how to mitigate disruptions to
their workloads.
- VMM can protect future memory error consumption by unmapping the poisoned
pages from stage-2 page table with KVM userfault [2], or by splitting the
memslot that contains the poisoned pages.
- VMM can keep track of SEA events in the VM. When VMM thinks the status
on the host or the VM is bad enough, e.g. number of distinct SEAs
exceeds a threshold, it can restart the VM on another healthy host.
- Behavior parity with x86 architecture. When machine check exception
(MCE) is caused by VCPU, kernel or KVM signals userspace SIGBUS to
let VMM either recover from the MCE, or terminate itself with VM.
The prior RFC proposes to implement SIGBUS on arm64 as well, but
Marc preferred KVM exit over signal [3]. However, implementation
aside, returning SEA to VMM is on par with returning MCE to VMM.
Once SEA is redirected to VMM, among other actions, VMM is encouraged
to inject external aborts into the faulting VCPU.
New UAPIs
=========
This patchset introduces following userspace-visible changes to empower
VMM to control what happens for SEA on guest memory:
- KVM_CAP_ARM_SEA_TO_USER. While taking SEA, if userspace has enabled
this new capability at VM creation, and the SEA is not owned by kernel
allocated memory, instead of injecting SError, return KVM_EXIT_ARM_SEA
to userspace.
- KVM_EXIT_ARM_SEA. This is the VM exit reason VMM gets. The details
about the SEA is provided in arm_sea as much as possible, including
sanitized ESR value at EL2, faulting guest virtual and physical
addresses if available.
* From v3 [4]
- Rebased on commit 3a8660878839 ("Linux 6.18-rc1").
- In selftest, print a message if GVA or GPA expects to be valid.
* From v2 [5]:
- Rebased on "[PATCH] KVM: arm64: nv: Handle SEAs due to VNCR redirection" [6]
and kvmarm/next commit 7b8346bd9fce6 ("KVM: arm64: Don't attempt vLPI
mappings when vPE allocation is disabled")
- Took the host_owns_sea implementation from Oliver [7, 8].
- Excluded the guest SEA injection patches.
- Updated selftest.
* From v1 [9]:
- Rebased on commit 4d62121ce9b5 ("KVM: arm64: vgic-debug: Avoid
dereferencing NULL ITE pointer").
- Sanitize ESR_EL2 before reporting it to userspace.
- Do not do KVM_EXIT_ARM_SEA when SEA is caused by memory allocated to
stage-2 translation table.
[1] https://cloud.google.com/solutions/sap/docs/manage-host-errors
[2] https://lore.kernel.org/kvm/20250109204929.1106563-1-jthoughton@google.com
[3] https://lore.kernel.org/kvm/86pljbqqh0.wl-maz@kernel.org
[4] https://lore.kernel.org/kvmarm/20250731205844.1346839-1-jiaqiyan@google.com
[5] https://lore.kernel.org/kvm/20250604050902.3944054-1-jiaqiyan@google.com
[6] https://lore.kernel.org/kvmarm/20250729182342.3281742-1-oliver.upton@linux.…
[7] https://lore.kernel.org/kvm/aHFohmTb9qR_JG1E@linux.dev
[8] https://lore.kernel.org/kvm/aHK-DPufhLy5Dtuk@linux.dev
[9] https://lore.kernel.org/kvm/20250505161412.1926643-1-jiaqiyan@google.com
Jiaqi Yan (3):
KVM: arm64: VM exit to userspace to handle SEA
KVM: selftests: Test for KVM_EXIT_ARM_SEA
Documentation: kvm: new UAPI for handling SEA
Documentation/virt/kvm/api.rst | 61 ++++
arch/arm64/include/asm/kvm_host.h | 2 +
arch/arm64/kvm/arm.c | 5 +
arch/arm64/kvm/mmu.c | 68 +++-
include/uapi/linux/kvm.h | 10 +
tools/arch/arm64/include/asm/esr.h | 2 +
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../testing/selftests/kvm/arm64/sea_to_user.c | 331 ++++++++++++++++++
tools/testing/selftests/kvm/lib/kvm_util.c | 1 +
9 files changed, 480 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/kvm/arm64/sea_to_user.c
--
2.51.0.760.g7b8bcc2412-goog
The resctrl selftest currently exhibits several failures on Hygon CPUs
due to missing vendor detection and edge-case handling specific to
Hygon's architecture.
This patch series addresses three distinct issues:
1. Missing CPU vendor detection, causing the test to fail with
"# Can not get vendor info..." on Hygon CPUs.
2. A division-by-zero crash in SNC detection on Hygon CPUs.
3. Incorrect handling of non-contiguous CBM support on Hygon CPUs.
These changes enable resctrl selftest to run successfully on
Hygon CPUs that support Platform QoS features.
Changelog:
v3:
- Patch 1:
1. Update the return types of detect_vendor() and get_vendor() from
'int' to 'unsigned int' to align with their usage as bitmask values
and to prevent potentially risky type conversions (Fenghua).
2. Split the code changes of "define CPU vendor IDs as bits to match
usage" from original patch 1 into a separate patch (this patch,
suggested by Fenghua and Reinette).
3. Introduce the flag 'initialized' to simplify the get_vendor() ->
detect_vendor() logic (Reinette).
- Patch 2 (original patch 1):
1. Move the code changes of "define CPU vendor IDs as bits to match
usage" into patch 1.
- Patch 3 (original patch 2):
1. Fix a nit of code comment for affected platforms (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
- Patch 4 (original patch 3):
1. Fix a nit to avoid calling get_vendor() twice (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
v2:
- Patch 1: switch all of the vendor id bitmasks to use BIT() (Reinette)
- Patch 2: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
- Patch 3: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
add a maintainer note to highlight it is not a candidate for
backport (Reinette)
Xiaochen Shen (4):
selftests/resctrl: Define CPU vendor IDs as bits to match usage
selftests/resctrl: Add CPU vendor detection for Hygon
selftests/resctrl: Fix a division by zero error on Hygon
selftests/resctrl: Fix non-contiguous CBM check for Hygon
tools/testing/selftests/resctrl/cat_test.c | 6 ++--
tools/testing/selftests/resctrl/resctrl.h | 8 ++++--
.../testing/selftests/resctrl/resctrl_tests.c | 28 +++++++++++++------
tools/testing/selftests/resctrl/resctrlfs.c | 10 +++++++
4 files changed, 39 insertions(+), 13 deletions(-)
--
2.47.3
v26: CONFIG_RISCV_USER_CFI depends on CONFIG_MMU (dependency of shadow stack
on MMU). Used b4 to pick tags, apparantly it messed up some tag picks. Fixing it
v25: Removal of `riscv_nousercfi` from `cpufeature.c` and instead placing
it as extern in `usercfi.h` was leading to build error whene cfi config
is not selected. Placed `riscv_nousercfi` outside cfi config ifdef block
in `usercfi.h`
v24:
Took PaulW suggestion for fixing commit desc and checkpatch fixes
in patch #21.
"riscv: kernel command line option to opt out of user cfi"
Collected "Tested-by" tags from Andreas and Valentin. Thanks a lot
for testing it.
v23:
fixed some of the "CHECK:" reported on checkpatch --strict.
Accepted Joel's suggestion for kselftest's Makefile.
CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22: fixing build error due to -march=zicfiss being picked in gcc-13 and above
but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v26:
- fixed S-O-B tag messed up by b4.
- CONFIG_RISCV_USER_CFI should depend on CONFIG_MMU
v25:
- fixed build error when cfi config is not selected due to missing symbol
error on `riscv_nousercfi`.
v24:
- Checkpatch fixes in patch # 21.
- Collected "Tested-by" tags.
v23:
- fixed some of the "CHECK:" reported on checkpatch --strict.
- Accepted Joel's suggestion for kselftest's Makefile.
- CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22:
- CONFIG_RISCV_USER_CFI was by default "n". With dual vdso support it is
default "y" (if toolchain supports it). Fixing build error due to
"-march=zicfiss" being picked in gcc-13 partially. gcc-13 only recognizes the
flag but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
- picked up tags and some cosmetic changes in commit message for dual vdso
patch.
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v26:
- Link to v25: https://lore.kernel.org/r/20251205-v5_user_cfi_series-v25-0-1a07c0127361@ri…
Changes in v25:
- Link to v24: https://lore.kernel.org/r/20251204-v5_user_cfi_series-v24-0-ada7a3ba14dc@ri…
Changes in v24:
- Link to v23: https://lore.kernel.org/r/20251112-v5_user_cfi_series-v23-0-b55691eacf4f@ri…
Changes in v23:
- Link to v22: https://lore.kernel.org/r/20251023-v5_user_cfi_series-v22-0-1935270f7636@ri…
Changes in v22:
- Link to v21: https://lore.kernel.org/r/20251015-v5_user_cfi_series-v21-0-6a07856e90e7@ri…
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 23 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 97 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 25 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 2 +
tools/testing/selftests/riscv/cfi/Makefile | 23 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/cfitests.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2482 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
This is a follow-up series of [1]. It tries to fix a possible UAF in the
fops of cros_ec_chardev after the underlying protocol device has gone by
using revocable.
The 1st patch introduces the revocable which is an implementation of ideas
from the talk [2].
The 2nd and 3rd patches add test cases for revocable in Kunit and selftest.
The 4th patch converts existing protocol devices to resource providers
of cros_ec_device.
The 5th - 7th are PoC patches for showing the use case of "Replace file
operations" below.
---
I came out with 2 possible usages of revocable.
1. Use primitive APIs
Use the primitive APIs of revocable directly.
The file operations make sure the resources are available when using them.
This is what the series original proposed[3][4]. Even though it has the
finest grain for accessing the resources, it makes the user code verbose.
Per feedback from the community, I'm looking for some subsystem level
helpers so that user code can be simlper.
2. Replace file operations
Replace filp->f_op to revocable-aware warppers.
The warppers make sure the resources are available in the file operations.
The user code needs to provide a callback .try_access() to tell the wrappers
where/how to *save* the pointers of resources.
Known drawback:
- The warppers reserve the resources for all file operations even if they
might be unused.
- The user code still needs to be revocable-aware.
- The whole file operation becomes a SRCU read-side critical section. Are
there any functions can't be called in the critical section? If there is,
the file operations may not be awared of that.
See 5th - 7th patches for an example usage.
[1] https://lore.kernel.org/chrome-platform/20250721044456.2736300-6-tzungbi@ke…
[2] https://lpc.events/event/17/contributions/1627/
[3] https://lore.kernel.org/chrome-platform/20250912081718.3827390-5-tzungbi@ke…
[4] https://lore.kernel.org/chrome-platform/20250912081718.3827390-6-tzungbi@ke…
v5:
- Rebase onto next-20251015.
- Add more context about the PoC.
- Support multiple revocable providers in the PoC.
v4: https://lore.kernel.org/chrome-platform/20250923075302.591026-1-tzungbi@ker…
- Rebase onto next-20250922.
- Remove the 5th patch from v3.
- Add fops replacement PoC in 5th - 7th patches.
v3: https://lore.kernel.org/chrome-platform/20250912081718.3827390-1-tzungbi@ke…
- Rebase onto https://lore.kernel.org/chrome-platform/20250828083601.856083-1-tzungbi@ker…
and next-20250912.
- The 4th patch changed accordingly.
v2: https://lore.kernel.org/chrome-platform/20250820081645.847919-1-tzungbi@ker…
- Rename "ref_proxy" -> "revocable".
- Add test cases in Kunit and selftest.
v1: https://lore.kernel.org/chrome-platform/20250814091020.1302888-1-tzungbi@ke…
Tzung-Bi Shih (7):
revocable: Revocable resource management
revocable: Add Kunit test cases
selftests: revocable: Add kselftest cases
platform/chrome: Protect cros_ec_device lifecycle with revocable
revocable: Add fops replacement
char: misc: Leverage revocable fops replacement
platform/chrome: cros_ec_chardev: Secure cros_ec_device via revocable
.../driver-api/driver-model/index.rst | 1 +
.../driver-api/driver-model/revocable.rst | 87 +++++++
MAINTAINERS | 9 +
drivers/base/Kconfig | 8 +
drivers/base/Makefile | 5 +-
drivers/base/revocable.c | 233 ++++++++++++++++++
drivers/base/revocable_test.c | 110 +++++++++
drivers/char/misc.c | 8 +
drivers/platform/chrome/cros_ec.c | 5 +
drivers/platform/chrome/cros_ec_chardev.c | 22 +-
fs/Makefile | 2 +-
fs/fs_revocable.c | 154 ++++++++++++
include/linux/fs.h | 2 +
include/linux/fs_revocable.h | 21 ++
include/linux/miscdevice.h | 4 +
include/linux/platform_data/cros_ec_proto.h | 4 +
include/linux/revocable.h | 53 ++++
tools/testing/selftests/Makefile | 1 +
.../selftests/drivers/base/revocable/Makefile | 7 +
.../drivers/base/revocable/revocable_test.c | 116 +++++++++
.../drivers/base/revocable/test-revocable.sh | 39 +++
.../base/revocable/test_modules/Makefile | 10 +
.../revocable/test_modules/revocable_test.c | 188 ++++++++++++++
23 files changed, 1086 insertions(+), 3 deletions(-)
create mode 100644 Documentation/driver-api/driver-model/revocable.rst
create mode 100644 drivers/base/revocable.c
create mode 100644 drivers/base/revocable_test.c
create mode 100644 fs/fs_revocable.c
create mode 100644 include/linux/fs_revocable.h
create mode 100644 include/linux/revocable.h
create mode 100644 tools/testing/selftests/drivers/base/revocable/Makefile
create mode 100644 tools/testing/selftests/drivers/base/revocable/revocable_test.c
create mode 100755 tools/testing/selftests/drivers/base/revocable/test-revocable.sh
create mode 100644 tools/testing/selftests/drivers/base/revocable/test_modules/Makefile
create mode 100644 tools/testing/selftests/drivers/base/revocable/test_modules/revocable_test.c
--
2.51.0.788.g6d19910ace-goog
The get_hw_info uses a smaller user buffer on purpose to check how
the kernel updates only the fields that fit in the buffer. The test
created a custom smaller struct for this, but the helper function later
treats the buffer as struct iommu_test_hw_info. This makes the compiler
warn about a possible out-of-bounds access (-Warray-bounds).
This keeps the test behavior the same and removes the warning.
Signed-off-by: Kathara Sasikumar <katharasasikumar007(a)gmail.com>
---
tools/testing/selftests/iommu/iommufd.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 10e051b6f592..f6aceb65313f 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -755,9 +755,7 @@ TEST_F(iommufd_ioas, get_hw_info)
struct iommu_test_hw_info info;
uint64_t trailing_bytes;
} buffer_larger;
- struct iommu_test_hw_info_buffer_smaller {
- __u32 flags;
- } buffer_smaller;
+ struct iommu_test_hw_info buffer_smaller;
if (self->device_id) {
uint8_t max_pasid = 0;
--
2.51.0
GCC gets a bit confused and reports:
In function '_test_cmd_get_hw_info',
inlined from 'iommufd_ioas_get_hw_info' at iommufd.c:779:3,
inlined from 'wrapper_iommufd_ioas_get_hw_info' at iommufd.c:752:1:
>> iommufd_utils.h:804:37: warning: array subscript 'struct iommu_test_hw_info[0]' is partly outside array bounds of 'struct iommu_test_hw_info_buffer_smaller[1]' [-Warray-bounds=]
804 | assert(!info->flags);
| ~~~~^~~~~~~
iommufd.c: In function 'wrapper_iommufd_ioas_get_hw_info':
iommufd.c:761:11: note: object 'buffer_smaller' of size 4
761 | } buffer_smaller;
| ^~~~~~~~~~~~~~
While it is true that "struct iommu_test_hw_info[0]" is partly out of
bounds of the input pointer, it is not true that info->flags is out of
bounds. Unclear why it warns on this.
Reuse an existing properly sized stack buffer and pass a truncated length
instead to test the same thing.
Fixes: af4fde93c319 ("iommufd/selftest: Add coverage for IOMMU_GET_HW_INFO ioctl")
Reported-by: kernel test robot <lkp(a)intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202512032344.kaAcKFIM-lkp@intel.com/
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
---
tools/testing/selftests/iommu/iommufd.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 10e051b6f592df..dadad277f4eb2e 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -755,9 +755,6 @@ TEST_F(iommufd_ioas, get_hw_info)
struct iommu_test_hw_info info;
uint64_t trailing_bytes;
} buffer_larger;
- struct iommu_test_hw_info_buffer_smaller {
- __u32 flags;
- } buffer_smaller;
if (self->device_id) {
uint8_t max_pasid = 0;
@@ -789,8 +786,9 @@ TEST_F(iommufd_ioas, get_hw_info)
* the fields within the size range still gets updated.
*/
test_cmd_get_hw_info(self->device_id,
- IOMMU_HW_INFO_TYPE_DEFAULT,
- &buffer_smaller, sizeof(buffer_smaller));
+ IOMMU_HW_INFO_TYPE_DEFAULT, &buffer_exact,
+ offsetofend(struct iommu_test_hw_info,
+ flags));
test_cmd_get_hw_info_pasid(self->device_id, &max_pasid);
ASSERT_EQ(0, max_pasid);
if (variant->pasid_capable) {
base-commit: 93013488dd77dd2ea8bd23355a5587d9e6dac185
--
2.43.0
On Thu, 11 Dec 2025 18:04:48 +0000,
Jiaqi Yan <jiaqiyan(a)google.com> wrote:
>
> [1 <text/plain; UTF-8 (quoted-printable)>]
> Hi Sebastian,
>
> CONFIGs seem alright to me. Do you boot kernel with cmdline options like
> "default_hugepagesz=1G hugepagesz=1G hugepages=64", or dynamically set up
> huge pages via "echo 64 >
> /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"?
I don't think this is irrelevant. The whole thing seems to have some
logic flaws, see the extensive report from Zenghui[1] as a reply to
your series.
M.
[1] https://lore.kernel.org/r/3061f5f8-cef0-b7b1-c4de-f2ceea29af9a@huawei.com
--
Without deviation from the norm, progress is not possible.