Cover three recent cases:
1. missing ops locking for the lowers during netdev_sync_lower_features
2. missing locking for dev_set_promiscuity (plus netdev_ops_assert_locked
with a comment on why/when it's needed)
3. rcu lock during team_change_rx_flags
Verified that each one triggers when the respective fix is reverted.
Not sure about the placement, but since it all relies on teaming,
added to the teaming directory.
One ugly bit is that I add NETIF_F_LRO to netdevsim; there is no way
to trigger netdev_sync_lower_features without it.
Signed-off-by: Stanislav Fomichev <stfomichev(a)gmail.com>
---
drivers/net/netdevsim/netdev.c | 2 +
net/core/dev.c | 10 ++-
.../selftests/drivers/net/team/Makefile | 2 +-
.../testing/selftests/drivers/net/team/config | 1 +
.../selftests/drivers/net/team/propagation.sh | 79 +++++++++++++++++++
5 files changed, 92 insertions(+), 2 deletions(-)
create mode 100755 tools/testing/selftests/drivers/net/team/propagation.sh
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 0e0321a7ddd7..3bd1f8cffee8 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -879,11 +879,13 @@ static void nsim_setup(struct net_device *dev)
NETIF_F_SG |
NETIF_F_FRAGLIST |
NETIF_F_HW_CSUM |
+ NETIF_F_LRO |
NETIF_F_TSO;
dev->hw_features |= NETIF_F_HW_TC |
NETIF_F_SG |
NETIF_F_FRAGLIST |
NETIF_F_HW_CSUM |
+ NETIF_F_LRO |
NETIF_F_TSO;
dev->max_mtu = ETH_MAX_MTU;
dev->xdp_features = NETDEV_XDP_ACT_HW_OFFLOAD;
diff --git a/net/core/dev.c b/net/core/dev.c
index 0d891634c692..4debd4b8e0f5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9188,8 +9188,16 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
dev_change_rx_flags(dev, IFF_PROMISC);
}
- if (notify)
+ if (notify) {
+ /* The ops lock is only required to ensure consistent locking
+ * for `NETDEV_CHANGE` notifiers. This function is sometimes
+ * called without the lock, even for devices that are ops
+ * locked, such as in `dev_uc_sync_multiple` when using
+ * bonding or teaming.
+ */
+ netdev_ops_assert_locked(dev);
__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
+ }
return 0;
}
diff --git a/tools/testing/selftests/drivers/net/team/Makefile b/tools/testing/selftests/drivers/net/team/Makefile
index 2d5a76d99181..eaf6938f100e 100644
--- a/tools/testing/selftests/drivers/net/team/Makefile
+++ b/tools/testing/selftests/drivers/net/team/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
# Makefile for net selftests
-TEST_PROGS := dev_addr_lists.sh
+TEST_PROGS := dev_addr_lists.sh propagation.sh
TEST_INCLUDES := \
../bonding/lag_lib.sh \
diff --git a/tools/testing/selftests/drivers/net/team/config b/tools/testing/selftests/drivers/net/team/config
index b5e3a3aad4bf..636b3525b679 100644
--- a/tools/testing/selftests/drivers/net/team/config
+++ b/tools/testing/selftests/drivers/net/team/config
@@ -1,5 +1,6 @@
CONFIG_DUMMY=y
CONFIG_IPV6=y
CONFIG_MACVLAN=y
+CONFIG_NETDEVSIM=m
CONFIG_NET_TEAM=y
CONFIG_NET_TEAM_MODE_LOADBALANCE=y
diff --git a/tools/testing/selftests/drivers/net/team/propagation.sh b/tools/testing/selftests/drivers/net/team/propagation.sh
new file mode 100755
index 000000000000..849a5f2cb3a7
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/team/propagation.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+NSIM_LRO_ID=$((256 + RANDOM % 256))
+NSIM_LRO_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_LRO_ID
+
+NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device
+NSIM_DEV_SYS_DEL=/sys/bus/netdevsim/del_device
+
+cleanup()
+{
+ ip link del dummyteam &>/dev/null
+ ip link del team0 &>/dev/null
+ echo $NSIM_LRO_ID > $NSIM_DEV_SYS_DEL
+}
+
+# Trigger LRO propagation to the lower.
+# https://lore.kernel.org/netdev/aBvOpkIoxcr9PfDg@mini-arch/
+team_lro()
+{
+ # using netdevsim because it supports NETIF_F_LRO
+ NSIM_LRO_NAME=$(find $NSIM_LRO_SYS/net -maxdepth 1 -type d ! \
+ -path $NSIM_LRO_SYS/net -exec basename {} \;)
+
+ ip link add name team0 type team
+ ip link set $NSIM_LRO_NAME down
+ ip link set dev $NSIM_LRO_NAME master team0
+ ip link set team0 up
+ ethtool -K team0 large-receive-offload off
+
+ ip link del team0
+}
+
+# Trigger promisc propagation to the lower during IFLA_MASTER.
+# https://lore.kernel.org/netdev/20250506032328.3003050-1-sdf@fomichev.me/
+team_promisc()
+{
+ ip link add name dummyteam type dummy
+ ip link add name team0 type team
+ ip link set dummyteam down
+ ip link set team0 promisc on
+ ip link set dev dummyteam master team0
+ ip link set team0 up
+
+ ip link del team0
+ ip link del dummyteam
+}
+
+# Trigger promisc propagation to the lower via netif_change_flags (aka
+# ndo_change_rx_flags).
+# https://lore.kernel.org/netdev/20250514220319.3505158-1-stfomichev@gmail.co…
+team_change_flags()
+{
+ ip link add name dummyteam type dummy
+ ip link add name team0 type team
+ ip link set dummyteam down
+ ip link set dev dummyteam master team0
+ ip link set team0 up
+ ip link set team0 promisc on
+
+ # Make sure we can add more L2 addresses without any issues.
+ ip link add link team0 address 00:00:00:00:00:01 team0.1 type macvlan
+ ip link set team0.1 up
+
+ ip link del team0.1
+ ip link del team0
+ ip link del dummyteam
+}
+
+trap cleanup EXIT
+modprobe netdevsim || :
+echo $NSIM_LRO_ID > $NSIM_DEV_SYS_NEW
+udevadm settle
+team_lro
+team_promisc
+team_change_flags
+modprobe -r netdevsim || :
--
2.49.0
When running 'make' in tools/testing/selftests/arm64/ without explicitly
setting the OUTPUT variable, the build system will creates test
directories (e.g., /bti) in the root filesystem due to OUTPUT defaulting
to an empty string. This causes unintended pollution of the root directory.
This patch adds proper handling for the OUTPUT variable: Sets OUTPUT
to the current directory (.) if not specified
Signed-off-by: tanze <tanze(a)kylinos.cn>
---
tools/testing/selftests/arm64/Makefile | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile
index 22029e60eff3..c4c72ee2ef55 100644
--- a/tools/testing/selftests/arm64/Makefile
+++ b/tools/testing/selftests/arm64/Makefile
@@ -21,6 +21,8 @@ CFLAGS += $(KHDR_INCLUDES)
CFLAGS += -I$(top_srcdir)/tools/include
+OUTPUT ?= $(CURDIR)
+
export CFLAGS
export top_srcdir
--
2.25.1
Two guard_regions tests on userfaultfd fail when userfaultfd is not
present. Skip them instead.
hugevm test reads kernel config to get page table level information and
fails when neither /proc/config.gz nor /boot/config-* is present. Skip
it instead.
Zi Yan (2):
selftests/mm: skip guard_regions.uffd tests when uffd is not present.
selftests/mm: skip hugevm test if kernel config file is not present.
tools/testing/selftests/mm/guard-regions.c | 17 ++++++++++--
.../selftests/mm/va_high_addr_switch.sh | 26 +++++++------------
2 files changed, 24 insertions(+), 19 deletions(-)
--
2.47.2
The ID_AA64PFR1_EL1.MTE_frac field is currently hidden from KVM.
However, when ID_AA64PFR1_EL1.MTE==2, ID_AA64PFR1_EL1.MTE_frac==0
indicates that MTE_ASYNC is supported. On a host with
ID_AA64PFR1_EL1.MTE==2 but without MTE_ASYNC support a guest with the
MTE capability enabled will incorrectly see MTE_ASYNC advertised as
supported. This series fixes that.
This was found by inspection and the current behaviour is not known to
break anything. Linux doesn't check MTE_frac, and wrongly, assumes
MTE async faults can be generated whenever MTE is supported. This is
a separate problem and not addressed here.
I am looking for feedback on whether this change is valuable or
otherwise.
Changes since v1:
Only pass MTE_Frac hw value to the guest when it is the exact failure case.
Changed base commit to v6.15-rc5 but still applies on v6.16-rc2 as well.
Ben Horgan (3):
arm64/sysreg: Expose MTE_frac so that it is visible to KVM
KVM: arm64: Make MTE_frac masking conditional on MTE capability
KVM: selftests: Confirm exposing MTE_frac does not break migration
arch/arm64/kernel/cpufeature.c | 1 +
arch/arm64/kvm/sys_regs.c | 28 ++++++-
.../testing/selftests/kvm/arm64/set_id_regs.c | 77 ++++++++++++++++++-
3 files changed, 103 insertions(+), 3 deletions(-)
base-commit: 92a09c47464d040866cf2b4cd052bc60555185fb
--
2.43.0
Add small grammar fixes in perf events and Real Time Clock tests'
output messages.
Include braces around a single if statement, when there are multiple
statements in the else branch, to align with the kernel coding style.
Signed-off-by: Hanne-Lotta Mäenpää <hannelotta(a)gmail.com>
---
tools/testing/selftests/perf_events/watermark_signal.c | 7 ++++---
tools/testing/selftests/rtc/rtctest.c | 10 +++++-----
2 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/perf_events/watermark_signal.c b/tools/testing/selftests/perf_events/watermark_signal.c
index 49dc1e831174..6176afd4950b 100644
--- a/tools/testing/selftests/perf_events/watermark_signal.c
+++ b/tools/testing/selftests/perf_events/watermark_signal.c
@@ -65,8 +65,9 @@ TEST(watermark_signal)
child = fork();
EXPECT_GE(child, 0);
- if (child == 0)
+ if (child == 0) {
do_child();
+ }
else if (child < 0) {
perror("fork()");
goto cleanup;
@@ -75,7 +76,7 @@ TEST(watermark_signal)
if (waitpid(child, &child_status, WSTOPPED) != child ||
!(WIFSTOPPED(child_status) && WSTOPSIG(child_status) == SIGSTOP)) {
fprintf(stderr,
- "failed to sycnhronize with child errno=%d status=%x\n",
+ "failed to synchronize with child errno=%d status=%x\n",
errno,
child_status);
goto cleanup;
@@ -84,7 +85,7 @@ TEST(watermark_signal)
fd = syscall(__NR_perf_event_open, &attr, child, -1, -1,
PERF_FLAG_FD_CLOEXEC);
if (fd < 0) {
- fprintf(stderr, "failed opening event %llx\n", attr.config);
+ fprintf(stderr, "failed to setup performance monitoring %llx\n", attr.config);
goto cleanup;
}
diff --git a/tools/testing/selftests/rtc/rtctest.c b/tools/testing/selftests/rtc/rtctest.c
index be175c0e6ae3..8fd4d5d3b527 100644
--- a/tools/testing/selftests/rtc/rtctest.c
+++ b/tools/testing/selftests/rtc/rtctest.c
@@ -138,10 +138,10 @@ TEST_F_TIMEOUT(rtc, date_read_loop, READ_LOOP_DURATION_SEC + 2) {
rtc_read = rtc_time_to_timestamp(&rtc_tm);
/* Time should not go backwards */
ASSERT_LE(prev_rtc_read, rtc_read);
- /* Time should not increase more then 1s at a time */
+ /* Time should not increase more than 1s per read */
ASSERT_GE(prev_rtc_read + 1, rtc_read);
- /* Sleep 11ms to avoid killing / overheating the RTC */
+ /* Sleep 11ms to avoid overheating the RTC */
nanosleep_with_retries(READ_LOOP_SLEEP_MS * 1000000);
prev_rtc_read = rtc_read;
@@ -236,7 +236,7 @@ TEST_F(rtc, alarm_alm_set) {
if (alarm_state == RTC_ALARM_DISABLED)
SKIP(return, "Skipping test since alarms are not supported.");
if (alarm_state == RTC_ALARM_RES_MINUTE)
- SKIP(return, "Skipping test since alarms has only minute granularity.");
+ SKIP(return, "Skipping test since alarms have only minute granularity.");
rc = ioctl(self->fd, RTC_RD_TIME, &tm);
ASSERT_NE(-1, rc);
@@ -306,7 +306,7 @@ TEST_F(rtc, alarm_wkalm_set) {
if (alarm_state == RTC_ALARM_DISABLED)
SKIP(return, "Skipping test since alarms are not supported.");
if (alarm_state == RTC_ALARM_RES_MINUTE)
- SKIP(return, "Skipping test since alarms has only minute granularity.");
+ SKIP(return, "Skipping test since alarms have only minute granularity.");
rc = ioctl(self->fd, RTC_RD_TIME, &alarm.time);
ASSERT_NE(-1, rc);
@@ -502,7 +502,7 @@ int main(int argc, char **argv)
if (access(rtc_file, R_OK) == 0)
ret = test_harness_run(argc, argv);
else
- ksft_exit_skip("[SKIP]: Cannot access rtc file %s - Exiting\n",
+ ksft_exit_skip("Cannot access RTC file %s - exiting\n",
rtc_file);
return ret;
--
2.39.5
Hi maintainers,
As part of the Kselftest task for the LFX Mentorship Program, I have
reviewed the futex selftest and made minor improvements to the message
clarity in `futex_requeue.c`.
Attached is the patch with the changes. I’ve also uploaded it to the
mentorship platform as instructed.
Please let me know if any changes are needed.
Thanks,
Shivam Sharma
10sharmashivam(a)gmail.com
The thuge-gen test program runs mmap() and shmget() tests for both every
available page size and the default page size, resulting in two tests for
the default size. These tests are distinct since the flags in the default
case do not specify an explicit size, add the flags to the test name that
is logged to deduplicate.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
tools/testing/selftests/mm/thuge-gen.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index cd5174d735be..a41bc1234b37 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -127,7 +127,7 @@ void test_mmap(unsigned long size, unsigned flags)
show(size);
ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
- "%s mmap %lu\n", __func__, size);
+ "%s mmap %lu %x\n", __func__, size, flags);
if (munmap(map, size * NUM_PAGES))
ksft_exit_fail_msg("%s: unmap %s\n", __func__, strerror(errno));
@@ -165,7 +165,7 @@ void test_shmget(unsigned long size, unsigned flags)
show(size);
ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
- "%s: mmap %lu\n", __func__, size);
+ "%s: mmap %lu %x\n", __func__, size, flags);
if (shmdt(map))
ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno));
}
---
base-commit: 82f2b0b97b36ee3fcddf0f0780a9a0825d52fec3
change-id: 20250514-selfests-mm-thuge-gen-dup-7e1c40716091
Best regards,
--
Mark Brown <broonie(a)kernel.org>
The mlock2-tests test_mlock_lock() test reports two test results with
an identical string, one reporitng if it successfully locked a block of
memory and another reporting if the lock is still present after doing an
unlock (following a similar pattern to other tests in the same program).
This confuses test automation since the test string is used to deduplicate
tests, change the post unlock test to report "Unlocked" instead like the
other tests to fix this.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
tools/testing/selftests/mm/mlock2-tests.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c
index 7f0d50fa361d..3e90ff37e336 100644
--- a/tools/testing/selftests/mm/mlock2-tests.c
+++ b/tools/testing/selftests/mm/mlock2-tests.c
@@ -196,7 +196,7 @@ static void test_mlock_lock(void)
ksft_exit_fail_msg("munlock(): %s\n", strerror(errno));
}
- ksft_test_result(!unlock_lock_check(map), "%s: Locked\n", __func__);
+ ksft_test_result(!unlock_lock_check(map), "%s: Unlocked\n", __func__);
munmap(map, 2 * page_size);
}
---
base-commit: 82f2b0b97b36ee3fcddf0f0780a9a0825d52fec3
change-id: 20250514-selftest-mm-mlock2-dup-277d586bb29d
Best regards,
--
Mark Brown <broonie(a)kernel.org>
When writing a test for fusectl, I referred to this Makefile as a
reference for creating a FUSE daemon in the selftests.
While doing so, I noticed that there is a minor issue in the Makefile.
The fuse_mnt.c file is not actually compiled into fuse_mnt.o,
and the code setting CFLAGS for it never takes effect.
The reason fuse_mnt compiles successfully is because CFLAGS is set
at the very beginning of the file.
Signed-off-by: Chen Linxuan <chenlinxuan(a)uniontech.com>
---
tools/testing/selftests/memfd/Makefile | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile
index 163b6f68631c4..e9b886c65153d 100644
--- a/tools/testing/selftests/memfd/Makefile
+++ b/tools/testing/selftests/memfd/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-CFLAGS += -D_FILE_OFFSET_BITS=64
CFLAGS += $(KHDR_INCLUDES)
TEST_GEN_PROGS := memfd_test
@@ -16,10 +15,9 @@ ifeq ($(VAR_LDLIBS),)
VAR_LDLIBS := -lfuse -pthread
endif
-fuse_mnt.o: CFLAGS += $(VAR_CFLAGS)
-
include ../lib.mk
+$(OUTPUT)/fuse_mnt: CFLAGS += $(VAR_CFLAGS)
$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS)
$(OUTPUT)/memfd_test: memfd_test.c common.c
--
2.43.0
Until CONFIG_DMABUF_SYSFS_STATS was added [1] it was only possible to
perform per-buffer accounting with debugfs which is not suitable for
production environments. Eventually we discovered the overhead with
per-buffer sysfs file creation/removal was significantly impacting
allocation and free times, and exacerbated kernfs lock contention. [2]
dma_buf_stats_setup() is responsible for 39% of single-page buffer
creation duration, or 74% of single-page dma_buf_export() duration when
stressing dmabuf allocations and frees.
I prototyped a change from per-buffer to per-exporter statistics with a
RCU protected list of exporter allocations that accommodates most (but
not all) of our use-cases and avoids almost all of the sysfs overhead.
While that adds less overhead than per-buffer sysfs, and less even than
the maintenance of the dmabuf debugfs_list, it's still *additional*
overhead on top of the debugfs_list and doesn't give us per-buffer info.
This series uses the existing dmabuf debugfs_list to implement a BPF
dmabuf iterator, which adds no overhead to buffer allocation/free and
provides per-buffer info. The list has been moved outside of
CONFIG_DEBUG_FS scope so that it is always populated. The BPF program
loaded by userspace that extracts per-buffer information gets to define
its own interface which avoids the lack of ABI stability with debugfs.
This will allow us to replace our use of CONFIG_DMABUF_SYSFS_STATS, and
the plan is to remove it from the kernel after the next longterm stable
release.
[1] https://lore.kernel.org/linux-media/20201210044400.1080308-1-hridya@google.…
[2] https://lore.kernel.org/all/20220516171315.2400578-1-tjmercier@google.com
v1: https://lore.kernel.org/all/20250414225227.3642618-1-tjmercier@google.com
v1 -> v2:
Make the DMA buffer list independent of CONFIG_DEBUG_FS per Christian
König
Add CONFIG_DMA_SHARED_BUFFER check to kernel/bpf/Makefile per kernel
test robot
Use BTF_ID_LIST_SINGLE instead of BTF_ID_LIST_GLOBAL_SINGLE per Song Liu
Fixup comment style, mixing code/declarations, and use ASSERT_OK_FD in
selftest per Song Liu
Add BPF_ITER_RESCHED feature to bpf_dmabuf_reg_info per Alexei
Starovoitov
Add open-coded iterator and selftest per Alexei Starovoitov
Add a second test buffer from the system dmabuf heap to selftests
Use the BPF program we'll use in production for selftest per Alexei
Starovoitov
https://r.android.com/c/platform/system/bpfprogs/+/3616123/2/dmabufIter.chttps://r.android.com/c/platform/system/memory/libmeminfo/+/3614259/1/libdm…
v2: https://lore.kernel.org/all/20250504224149.1033867-1-tjmercier@google.com
v2 -> v3:
Rebase onto bpf-next/master
Move get_next_dmabuf() into drivers/dma-buf/dma-buf.c, along with the
new get_first_dmabuf(). This avoids having to expose the dmabuf list
and mutex to the rest of the kernel, and keeps the dmabuf mutex
operations near each other in the same file. (Christian König)
Add Christian's RB to dma-buf: Rename debugfs symbols
Drop RFC: dma-buf: Remove DMA-BUF statistics
v3: https://lore.kernel.org/all/20250507001036.2278781-1-tjmercier@google.com
v3 -> v4:
Fix selftest BPF program comment style (not kdoc) per Alexei Starovoitov
Fix dma-buf.c kdoc comment style per Alexei Starovoitov
Rename get_first_dmabuf / get_next_dmabuf to dma_buf_iter_begin /
dma_buf_iter_next per Christian König
Add Christian's RB to bpf: Add dmabuf iterator
v4: https://lore.kernel.org/all/20250508182025.2961555-1-tjmercier@google.com
v4 -> v5:
Add Christian's Acks to all patches
Add Song Liu's Acks
Move BTF_ID_LIST_SINGLE and DEFINE_BPF_ITER_FUNC closer to usage per
Song Liu
Fix open-coded iterator comment style per Song Liu
Move iterator termination check to its own subtest per Song Liu
Rework selftest buffer creation per Song Liu
Fix spacing in sanitize_string per BPF CI
v5: https://lore.kernel.org/all/20250512174036.266796-1-tjmercier@google.com
v5 -> v6:
Song Liu:
Init test buffer FDs to -1
Zero-init udmabuf_create for future proofing
Bail early for iterator fd/FILE creation failure
Dereference char ptr to check for NUL in sanitize_string()
Move map insertion from create_test_buffers() to test_dmabuf_iter()
Add ACK to selftests/bpf: Add test for open coded dmabuf_iter
T.J. Mercier (5):
dma-buf: Rename debugfs symbols
bpf: Add dmabuf iterator
bpf: Add open coded dmabuf iterator
selftests/bpf: Add test for dmabuf_iter
selftests/bpf: Add test for open coded dmabuf_iter
drivers/dma-buf/dma-buf.c | 98 ++++--
include/linux/dma-buf.h | 4 +-
kernel/bpf/Makefile | 3 +
kernel/bpf/dmabuf_iter.c | 150 +++++++++
kernel/bpf/helpers.c | 5 +
.../testing/selftests/bpf/bpf_experimental.h | 5 +
tools/testing/selftests/bpf/config | 3 +
.../selftests/bpf/prog_tests/dmabuf_iter.c | 285 ++++++++++++++++++
.../testing/selftests/bpf/progs/dmabuf_iter.c | 91 ++++++
9 files changed, 622 insertions(+), 22 deletions(-)
create mode 100644 kernel/bpf/dmabuf_iter.c
create mode 100644 tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c
create mode 100644 tools/testing/selftests/bpf/progs/dmabuf_iter.c
base-commit: 43745d11bfd9683abdf08ad7a5cc403d6a9ffd15
--
2.49.0.1045.g170613ef41-goog
From: Chia-Yu Chang <chia-yu.chang(a)nokia-bell-labs.com>
Hello,
Please find the v6:
v5 (09-May-2025)
- Add #3 to utilize exisintg holes of tcp_sock_write_txrx group for later patches (#4, #9, #10) with new u8 members (Paolo Abeni <pabeni(a)redhat.com>)
- Add pahole outcomes before and after commit in #4, #5, #6, #9, #10, #15 (Paolo Abeni <pabeni(a)redhat.com>)
- Define new helper function tcp_send_ack_reflect_ect() for sending ACK with reflected ECT in #5 (Paolo Abeni <pabeni(a)redhat.com>)
- Add comments for function tcp_ecn_rcv_synack() in #5 (Paolo Abeni <pabeni(a)redhat.com>)
- Add enum/define to be used by sysctl_tcp_ecn in #5, sysctl_tcp_ecn_option in #9, and sysctl_tcp_ecn_option_beacon in #10 (Paolo Abeni <pabeni(a)redhat.com>)
- Move accecn_fail_mode and saw_accecn_opt in #5 and #11 to use exisintg holes of tcp_sock (Paolo Abeni <pabeni(a)redhat.com>)
- Change data type of new members of tcp_request_sock and move them to the end of struct in #5 and #11 (Paolo Abeni <pabeni(a)redhat.com>)
- Move new members of tcp_info to the end of struct in #6 (Paolo Abeni <pabeni(a)redhat.com>)
- Merge previous #7 into #9 (Paolo Abeni <pabeni(a)redhat.com>)
- Mask ecnfield with INET_ECN_MASK to remove WARN_ONCE in #9 (Paolo Abeni <pabeni(a)redhat.com>)
- Reduce the indentation levels for reabability in #9 and #10 (Paolo Abeni <pabeni(a)redhat.com>)
- Move delivered_ecn_bytes to the RX group in #9, accecn_opt_tstamp to the TX group in #10, pkts_acked_ewma to the RX group in #15 (Paolo Abeni <pabeni(a)redhat.com>)
- Add changes in Documentation/networking/net_cachelines/tcp_sock.rst for new tcp_sock members in #3, #5, #6, #9, #10, #15
v5 (22-Apr-2025)
- Further fix for 32-bit ARM alignment in tcp.c (Simon Horman <horms(a)kernel.org>)
v4 (18-Apr-2025)
- Fix 32-bit ARM assertion for alignment requirement (Simon Horman <horms(a)kernel.org>)
v3 (14-Apr-2025)
- Fix patch apply issue in v2 (Jakub Kicinski <kuba(a)kernel.org>)
v2 (18-Mar-2025)
- Add one missing patch from the previous AccECN protocol preparation patch series to this patch series.
The full patch series can be found in
https://github.com/L4STeam/linux-net-next/commits/upstream_l4steam/
The Accurate ECN draft can be found in
https://datatracker.ietf.org/doc/html/draft-ietf-tcpm-accurate-ecn-28
Best regards,
Chia-Yu
Chia-Yu Chang (2):
tcp: reorganize tcp_sock_write_txrx group for variables later
tcp: accecn: AccECN option failure handling
Ilpo Järvinen (13):
tcp: reorganize SYN ECN code
tcp: fast path functions later
tcp: AccECN core
tcp: accecn: AccECN negotiation
tcp: accecn: add AccECN rx byte counters
tcp: accecn: AccECN needs to know delivered bytes
tcp: sack option handling improvements
tcp: accecn: AccECN option
tcp: accecn: AccECN option send control
tcp: accecn: AccECN option ceb/cep heuristic
tcp: accecn: AccECN ACE field multi-wrap heuristic
tcp: accecn: try to fit AccECN option with SACK
tcp: try to avoid safer when ACKs are thinned
.../networking/net_cachelines/tcp_sock.rst | 14 +
include/linux/tcp.h | 34 +-
include/net/netns/ipv4.h | 2 +
include/net/tcp.h | 221 ++++++-
include/uapi/linux/tcp.h | 7 +
net/ipv4/syncookies.c | 3 +
net/ipv4/sysctl_net_ipv4.c | 19 +
net/ipv4/tcp.c | 30 +-
net/ipv4/tcp_input.c | 608 +++++++++++++++++-
net/ipv4/tcp_ipv4.c | 7 +-
net/ipv4/tcp_minisocks.c | 92 ++-
net/ipv4/tcp_output.c | 297 ++++++++-
net/ipv6/syncookies.c | 1 +
net/ipv6/tcp_ipv6.c | 1 +
14 files changed, 1237 insertions(+), 99 deletions(-)
--
2.34.1
Hello,
This patchset is our exploration of how to support 1G pages in guest_memfd, and
how the pages will be used in Confidential VMs.
The patchset covers:
+ How to get 1G pages
+ Allowing mmap() of guest_memfd to userspace so that both private and shared
memory can use the same physical pages
+ Splitting and reconstructing pages to support conversions and mmap()
+ How the VM, userspace and guest_memfd interact to support conversions
+ Selftests to test all the above
+ Selftests also demonstrate the conversion flow between VM, userspace and
guest_memfd.
Why 1G pages in guest memfd?
Bring guest_memfd to performance and memory savings parity with VMs that are
backed by HugeTLBfs.
+ Performance is improved with 1G pages by more TLB hits and faster page walks
on TLB misses.
+ Memory savings from 1G pages comes from HugeTLB Vmemmap Optimization (HVO).
Options for 1G page support:
1. HugeTLB
2. Contiguous Memory Allocator (CMA)
3. Other suggestions are welcome!
Comparison between options:
1. HugeTLB
+ Refactor HugeTLB to separate allocator from the rest of HugeTLB
+ Pro: Graceful transition for VMs backed with HugeTLB to guest_memfd
+ Near term: Allows co-tenancy of HugeTLB and guest_memfd backed VMs
+ Pro: Can provide iterative steps toward new future allocator
+ Unexplored: Managing userspace-visible changes
+ e.g. HugeTLB's free_hugepages will decrease if HugeTLB is used,
but not when future allocator is used
2. CMA
+ Port some HugeTLB features to be applied on CMA
+ Pro: Clean slate
What would refactoring HugeTLB involve?
(Some refactoring was done in this RFC, more can be done.)
1. Broadly involves separating the HugeTLB allocator from the rest of HugeTLB
+ Brings more modularity to HugeTLB
+ No functionality change intended
+ Likely step towards HugeTLB's integration into core-mm
2. guest_memfd will use just the allocator component of HugeTLB, not including
the complex parts of HugeTLB like
+ Userspace reservations (resv_map)
+ Shared PMD mappings
+ Special page walkers
What features would need to be ported to CMA?
+ Improved allocation guarantees
+ Per NUMA node pool of huge pages
+ Subpools per guest_memfd
+ Memory savings
+ Something like HugeTLB Vmemmap Optimization
+ Configuration/reporting features
+ Configuration of number of pages available (and per NUMA node) at and
after host boot
+ Reporting of memory usage/availability statistics at runtime
HugeTLB was picked as the source of 1G pages for this RFC because it allows a
graceful transition, and retains memory savings from HVO.
To illustrate this, if a host machine uses HugeTLBfs to back VMs, and a
confidential VM were to be scheduled on that host, some HugeTLBfs pages would
have to be given up and returned to CMA for guest_memfd pages to be rebuilt from
that memory. This requires memory to be reserved for HVO to be removed and
reapplied on the new guest_memfd memory. This not only slows down memory
allocation but also trims the benefits of HVO. Memory would have to be reserved
on the host to facilitate these transitions.
Improving how guest_memfd uses the allocator in a future revision of this RFC:
To provide an easier transition away from HugeTLB, guest_memfd's use of HugeTLB
should be limited to these allocator functions:
+ reserve(node, page_size, num_pages) => opaque handle
+ Used when a guest_memfd inode is created to reserve memory from backend
allocator
+ allocate(handle, mempolicy, page_size) => folio
+ To allocate a folio from guest_memfd's reservation
+ split(handle, folio, target_page_size) => void
+ To take a huge folio, and split it to smaller folios, restore to filemap
+ reconstruct(handle, first_folio, nr_pages) => void
+ To take a folio, and reconstruct a huge folio out of nr_pages from the
first_folio
+ free(handle, folio) => void
+ To return folio to guest_memfd's reservation
+ error(handle, folio) => void
+ To handle memory errors
+ unreserve(handle) => void
+ To return guest_memfd's reservation to allocator backend
Userspace should only provide a page size when creating a guest_memfd and should
not have to specify HugeTLB.
Overview of patches:
+ Patches 01-12
+ Many small changes to HugeTLB, mostly to separate HugeTLBfs concepts from
HugeTLB, and to expose HugeTLB functions.
+ Patches 13-16
+ Letting guest_memfd use HugeTLB
+ Creation of each guest_memfd reserves pages from HugeTLB's global hstate
and puts it into the guest_memfd inode's subpool
+ Each folio allocation takes a page from the guest_memfd inode's subpool
+ Patches 17-21
+ Selftests for new HugeTLB features in guest_memfd
+ Patches 22-24
+ More small changes on the HugeTLB side to expose functions needed by
guest_memfd
+ Patch 25:
+ Uses the newly available functions from patches 22-24 to split HugeTLB
pages. In this patch, HugeTLB folios are always split to 4K before any
usage, private or shared.
+ Patches 26-28
+ Allow mmap() in guest_memfd and faulting in shared pages
+ Patch 29
+ Enables conversion between private/shared pages
+ Patch 30
+ Required to zero folios after conversions to avoid leaking initialized
kernel memory
+ Patch 31-38
+ Add selftests to test mapping pages to userspace, guest/host memory
sharing and update conversions tests
+ Patch 33 illustrates the conversion flow between VM/userspace/guest_memfd
+ Patch 39
+ Dynamically split and reconstruct HugeTLB pages instead of always
splitting before use. All earlier selftests are expected to still pass.
TODOs:
+ Add logic to wait for safe_refcount [1]
+ Look into lazy splitting/reconstruction of pages
+ Currently, when the KVM_SET_MEMORY_ATTRIBUTES is invoked, not only is the
mem_attr_array and faultability updated, the pages in the requested range
are also split/reconstructed as necessary. We want to look into delaying
splitting/reconstruction to fault time.
+ Solve race between folios being faulted in and being truncated
+ When running private_mem_conversions_test with more than 1 vCPU, a folio
getting truncated may get faulted in by another process, causing elevated
mapcounts when the folio is freed (VM_BUG_ON_FOLIO).
+ Add intermediate splits (1G should first split to 2M and not split directly to
4K)
+ Use guest's lock instead of hugetlb_lock
+ Use multi-index xarray/replace xarray with some other data struct for
faultability flag
+ Refactor HugeTLB better, present generic allocator interface
Please let us know your thoughts on:
+ HugeTLB as the choice of transitional allocator backend
+ Refactoring HugeTLB to provide generic allocator interface
+ Shared/private conversion flow
+ Requiring user to request kernel to unmap pages from userspace using
madvise(MADV_DONTNEED)
+ Failing conversion on elevated mapcounts/pincounts/refcounts
+ Process of splitting/reconstructing page
+ Anything else!
[1] https://lore.kernel.org/all/20240829-guest-memfd-lib-v2-0-b9afc1ff3656@quic…
Ackerley Tng (37):
mm: hugetlb: Simplify logic in dequeue_hugetlb_folio_vma()
mm: hugetlb: Refactor vma_has_reserves() to should_use_hstate_resv()
mm: hugetlb: Remove unnecessary check for avoid_reserve
mm: mempolicy: Refactor out policy_node_nodemask()
mm: hugetlb: Refactor alloc_buddy_hugetlb_folio_with_mpol() to
interpret mempolicy instead of vma
mm: hugetlb: Refactor dequeue_hugetlb_folio_vma() to use mpol
mm: hugetlb: Refactor out hugetlb_alloc_folio
mm: truncate: Expose preparation steps for truncate_inode_pages_final
mm: hugetlb: Expose hugetlb_subpool_{get,put}_pages()
mm: hugetlb: Add option to create new subpool without using surplus
mm: hugetlb: Expose hugetlb_acct_memory()
mm: hugetlb: Move and expose hugetlb_zero_partial_page()
KVM: guest_memfd: Make guest mem use guest mem inodes instead of
anonymous inodes
KVM: guest_memfd: hugetlb: initialization and cleanup
KVM: guest_memfd: hugetlb: allocate and truncate from hugetlb
KVM: guest_memfd: Add page alignment check for hugetlb guest_memfd
KVM: selftests: Add basic selftests for hugetlb-backed guest_memfd
KVM: selftests: Support various types of backing sources for private
memory
KVM: selftests: Update test for various private memory backing source
types
KVM: selftests: Add private_mem_conversions_test.sh
KVM: selftests: Test that guest_memfd usage is reported via hugetlb
mm: hugetlb: Expose vmemmap optimization functions
mm: hugetlb: Expose HugeTLB functions for promoting/demoting pages
mm: hugetlb: Add functions to add/move/remove from hugetlb lists
KVM: guest_memfd: Track faultability within a struct kvm_gmem_private
KVM: guest_memfd: Allow mmapping guest_memfd files
KVM: guest_memfd: Use vm_type to determine default faultability
KVM: Handle conversions in the SET_MEMORY_ATTRIBUTES ioctl
KVM: guest_memfd: Handle folio preparation for guest_memfd mmap
KVM: selftests: Allow vm_set_memory_attributes to be used without
asserting return value of 0
KVM: selftests: Test using guest_memfd memory from userspace
KVM: selftests: Test guest_memfd memory sharing between guest and host
KVM: selftests: Add notes in private_mem_kvm_exits_test for mmap-able
guest_memfd
KVM: selftests: Test that pinned pages block KVM from setting memory
attributes to PRIVATE
KVM: selftests: Refactor vm_mem_add to be more flexible
KVM: selftests: Add helper to perform madvise by memslots
KVM: selftests: Update private_mem_conversions_test for mmap()able
guest_memfd
Vishal Annapurve (2):
KVM: guest_memfd: Split HugeTLB pages for guest_memfd use
KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page
fs/hugetlbfs/inode.c | 35 +-
include/linux/hugetlb.h | 54 +-
include/linux/kvm_host.h | 1 +
include/linux/mempolicy.h | 2 +
include/linux/mm.h | 1 +
include/uapi/linux/kvm.h | 26 +
include/uapi/linux/magic.h | 1 +
mm/hugetlb.c | 346 ++--
mm/hugetlb_vmemmap.h | 11 -
mm/mempolicy.c | 36 +-
mm/truncate.c | 26 +-
tools/include/linux/kernel.h | 4 +-
tools/testing/selftests/kvm/Makefile | 3 +
.../kvm/guest_memfd_hugetlb_reporting_test.c | 222 +++
.../selftests/kvm/guest_memfd_pin_test.c | 104 ++
.../selftests/kvm/guest_memfd_sharing_test.c | 160 ++
.../testing/selftests/kvm/guest_memfd_test.c | 238 ++-
.../testing/selftests/kvm/include/kvm_util.h | 45 +-
.../testing/selftests/kvm/include/test_util.h | 18 +
tools/testing/selftests/kvm/lib/kvm_util.c | 443 +++--
tools/testing/selftests/kvm/lib/test_util.c | 99 ++
.../kvm/x86_64/private_mem_conversions_test.c | 158 +-
.../x86_64/private_mem_conversions_test.sh | 91 +
.../kvm/x86_64/private_mem_kvm_exits_test.c | 11 +-
virt/kvm/guest_memfd.c | 1563 ++++++++++++++++-
virt/kvm/kvm_main.c | 17 +
virt/kvm/kvm_mm.h | 16 +
27 files changed, 3288 insertions(+), 443 deletions(-)
create mode 100644 tools/testing/selftests/kvm/guest_memfd_hugetlb_reporting_test.c
create mode 100644 tools/testing/selftests/kvm/guest_memfd_pin_test.c
create mode 100644 tools/testing/selftests/kvm/guest_memfd_sharing_test.c
create mode 100755 tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.sh
--
2.46.0.598.g6f2099f65c-goog
v14: https://lore.kernel.org/netdev/20250429032645.363766-1-almasrymina@google.c…
===
Picked up acks from Paolo, and addressed feedback from Paolo and Jakub.
Changelog:
- Fix issue in patch 4 where sockc_valid == false but err is
overwritten.
- Addressed nits.
v13: https://lore.kernel.org/netdev/20250425204743.617260-1-almasrymina@google.c…
===
Changelog:
- Fix unneeded error label pointed out by Christoph, and addressed
nitpick.
v12: https://lore.kernel.org/netdev/20250423031117.907681-1-almasrymina@google.c…
====
No changes in v12, just restored the selftests patch I accidentally dropped in
v11
v11: https://lore.kernel.org/netdev/20250423031117.907681-1-almasrymina@google.c…
====
Addressed a couple of nits and collected Acked-by from Harshitha
(thanks!)
v10: https://lore.kernel.org/netdev/20250417231540.2780723-1-almasrymina@google.…
====
Addressed comments following conversations with Pavel, Stan, and
Harshitha. Thank you guys for the reviews again. Overall minor changes:
Changelog:
- Check for !niov->pp in io_zcrx_recv_frag, just in case we end up with
a TX niov in that path (Pavel).
- Fix locking case in !netif_device_present (Jakub/Stan).
v9: https://lore.kernel.org/netdev/20250415224756.152002-1-almasrymina@google.c…
===
Changelog:
- Use priv->bindings list instead of sock_bindings_list. This was missed
during the rebase as the bindings have been updated to use
priv->bindings recently (thanks Stan!)
v8: https://lore.kernel.org/netdev/20250308214045.1160445-1-almasrymina@google.…
===
Only address minor comments on V7
Changelog:
- Use netdev locking instead of rtnl_locking to match rx path.
- Now that iouring zcrx is in net-next, use NET_IOV_IOURING instead of
NET_IOV_UNSPECIFIED.
- Post send binding to net_devmem_dmabuf_bindings after it's been fully
initialized (Stan).
v7: https://lore.kernel.org/netdev/20250227041209.2031104-1-almasrymina@google.…
===
Changelog:
- Check the dmabuf net_iov binding belongs to the device the TX is going
out on. (Jakub)
- Provide detailed inspection of callsites of
__skb_frag_ref/skb_page_unref in patch 2's changelog (Jakub)
v6: https://lore.kernel.org/netdev/20250222191517.743530-1-almasrymina@google.c…
===
v6 has no major changes. Addressed a few issues from Paolo and David,
and collected Acks from Stan. Thank you everyone for the review!
Changes:
- retain behavior to process MSG_FASTOPEN even if the provided cmsg is
invalid (Paolo).
- Rework the freeing of tx_vec slightly (it now has its own err label).
(Paolo).
- Squash the commit that makes dmabuf unbinding scheduled work into the
same one which implements the TX path so we don't run into future
errors on bisecting (Paolo).
- Fix/add comments to explain how dmabuf binding refcounting works
(David).
v5: https://lore.kernel.org/netdev/20250220020914.895431-1-almasrymina@google.c…
===
v5 has no major changes; it clears up the relatively minor issues
pointed out to in v4, and rebases the series on top of net-next to
resolve the conflict with a patch that raced to the tree. It also
collects the review tags from v4.
Changes:
- Rebase to net-next
- Fix issues in selftest (Stan).
- Address comments in the devmem and netmem driver docs (Stan and Bagas)
- Fix zerocopy_fill_skb_from_devmem return error code (Stan).
v4: https://lore.kernel.org/netdev/20250203223916.1064540-1-almasrymina@google.…
===
v4 mainly addresses the critical driver support issue surfaced in v3 by
Paolo and Stan. Drivers aiming to support netmem_tx should make sure not
to pass the netmem dma-addrs to the dma-mapping APIs, as these dma-addrs
may come from dma-bufs.
Additionally other feedback from v3 is addressed.
Major changes:
- Add helpers to handle netmem dma-addrs. Add GVE support for
netmem_tx.
- Fix binding->tx_vec not being freed on error paths during the
tx binding.
- Add a minimal devmem_tx test to devmem.py.
- Clean up everything obsolete from the cover letter (Paolo).
v3: https://patchwork.kernel.org/project/netdevbpf/list/?series=929401&state=*
===
Address minor comments from RFCv2 and fix a few build warnings and
ynl-regen issues. No major changes.
RFC v2: https://patchwork.kernel.org/project/netdevbpf/list/?series=920056&state=*
=======
RFC v2 addresses much of the feedback from RFC v1. I plan on sending
something close to this as net-next reopens, sending it slightly early
to get feedback if any.
Major changes:
--------------
- much improved UAPI as suggested by Stan. We now interpret the iov_base
of the passed in iov from userspace as the offset into the dmabuf to
send from. This removes the need to set iov.iov_base = NULL which may
be confusing to users, and enables us to send multiple iovs in the
same sendmsg() call. ncdevmem and the docs show a sample use of that.
- Removed the duplicate dmabuf iov_iter in binding->iov_iter. I think
this is good improvment as it was confusing to keep track of
2 iterators for the same sendmsg, and mistracking both iterators
caused a couple of bugs reported in the last iteration that are now
resolved with this streamlining.
- Improved test coverage in ncdevmem. Now multiple sendmsg() are tested,
and sending multiple iovs in the same sendmsg() is tested.
- Fixed issue where dmabuf unmapping was happening in invalid context
(Stan).
====================================================================
The TX path had been dropped from the Device Memory TCP patch series
post RFCv1 [1], to make that series slightly easier to review. This
series rebases the implementation of the TX path on top of the
net_iov/netmem framework agreed upon and merged. The motivation for
the feature is thoroughly described in the docs & cover letter of the
original proposal, so I don't repeat the lengthy descriptions here, but
they are available in [1].
Full outline on usage of the TX path is detailed in the documentation
included with this series.
Test example is available via the kselftest included in the series as well.
The series is relatively small, as the TX path for this feature largely
piggybacks on the existing MSG_ZEROCOPY implementation.
Patch Overview:
---------------
1. Documentation & tests to give high level overview of the feature
being added.
1. Add netmem refcounting needed for the TX path.
2. Devmem TX netlink API.
3. Devmem TX net stack implementation.
4. Make dma-buf unbinding scheduled work to handle TX cases where it gets
freed from contexts where we can't sleep.
5. Add devmem TX documentation.
6. Add scaffolding enabling driver support for netmem_tx. Add helpers, driver
feature flag, and docs to enable drivers to declare netmem_tx support.
7. Guard netmem_tx against being enabled against drivers that don't
support it.
8. Add devmem_tx selftests. Add TX path to ncdevmem and add a test to
devmem.py.
Testing:
--------
Testing is very similar to devmem TCP RX path. The ncdevmem test used
for the RX path is now augemented with client functionality to test TX
path.
* Test Setup:
Kernel: net-next with this RFC and memory provider API cherry-picked
locally.
Hardware: Google Cloud A3 VMs.
NIC: GVE with header split & RSS & flow steering support.
Performance results are not included with this version, unfortunately.
I'm having issues running the dma-buf exporter driver against the
upstream kernel on my test setup. The issues are specific to that
dma-buf exporter and do not affect this patch series. I plan to follow
up this series with perf fixes if the tests point to issues once they're
up and running.
Special thanks to Stan who took a stab at rebasing the TX implementation
on top of the netmem/net_iov framework merged. Parts of his proposal [2]
that are reused as-is are forked off into their own patches to give full
credit.
[1] https://lore.kernel.org/netdev/20240909054318.1809580-1-almasrymina@google.…
[2] https://lore.kernel.org/netdev/20240913150913.1280238-2-sdf@fomichev.me/T/#…
Cc: sdf(a)fomichev.me
Cc: asml.silence(a)gmail.com
Cc: dw(a)davidwei.uk
Cc: Jamal Hadi Salim <jhs(a)mojatatu.com>
Cc: Victor Nogueira <victor(a)mojatatu.com>
Cc: Pedro Tammela <pctammela(a)mojatatu.com>
Cc: Samiullah Khawaja <skhawaja(a)google.com>
Cc: Kuniyuki Iwashima <kuniyu(a)amazon.com>
Mina Almasry (8):
netmem: add niov->type attribute to distinguish different net_iov
types
net: add get_netmem/put_netmem support
net: devmem: Implement TX path
net: add devmem TCP TX documentation
net: enable driver support for netmem TX
gve: add netmem TX support to GVE DQO-RDA mode
net: check for driver support in netmem TX
selftests: ncdevmem: Implement devmem TCP TX
Stanislav Fomichev (1):
net: devmem: TCP tx netlink api
Documentation/netlink/specs/netdev.yaml | 12 +
Documentation/networking/devmem.rst | 150 ++++++++-
.../networking/net_cachelines/net_device.rst | 1 +
Documentation/networking/netdev-features.rst | 5 +
Documentation/networking/netmem.rst | 23 +-
drivers/net/ethernet/google/gve/gve_main.c | 3 +
drivers/net/ethernet/google/gve/gve_tx_dqo.c | 8 +-
include/linux/netdevice.h | 2 +
include/linux/skbuff.h | 17 +-
include/linux/skbuff_ref.h | 4 +-
include/net/netmem.h | 34 +-
include/net/sock.h | 1 +
include/uapi/linux/netdev.h | 1 +
io_uring/zcrx.c | 3 +-
net/core/datagram.c | 48 ++-
net/core/dev.c | 34 +-
net/core/devmem.c | 131 ++++++--
net/core/devmem.h | 83 ++++-
net/core/netdev-genl-gen.c | 13 +
net/core/netdev-genl-gen.h | 1 +
net/core/netdev-genl.c | 80 ++++-
net/core/skbuff.c | 48 ++-
net/core/sock.c | 5 +
net/ipv4/ip_output.c | 3 +-
net/ipv4/tcp.c | 48 ++-
net/ipv6/ip6_output.c | 3 +-
net/vmw_vsock/virtio_transport_common.c | 5 +-
tools/include/uapi/linux/netdev.h | 1 +
.../selftests/drivers/net/hw/devmem.py | 26 +-
.../selftests/drivers/net/hw/ncdevmem.c | 300 +++++++++++++++++-
30 files changed, 1007 insertions(+), 86 deletions(-)
base-commit: 3e52667a9c328b3d1a1ddbbb6b8fbf63a217bda3
--
2.49.0.987.g0cc8ee98dc-goog
The kunit test that checks the longests symbol length [1], has triggered
warnings in some pilelines when symbol prefixes are used [2]. The test
is adjunsted to depend on !PREFIX_SYMBOLS and !CFI_CLANG as sujested in [3]
[1] https://lore.kernel.org/rust-for-linux/CABVgOSm=5Q0fM6neBhxSbOUHBgNzmwf2V22…
[2] https://lore.kernel.org/all/20250328112156.2614513-1-arnd@kernel.org/T/#u
[3] https://lore.kernel.org/linux-kselftest/20250427200916.GA1661412@ax162/T/#t
Signed-off-by: Sergio González Collado <sergio.collado(a)gmail.com>
---
V1 -> V2: added dependency on !CFI_CLANG as suggested in [3], removed
CONFIG_ prefix
---
lib/Kconfig.debug | 2 +-
lib/tests/longest_symbol_kunit.c | 3 +--
2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f9051ab610d5..5b33673d82da 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2885,7 +2885,7 @@ config FORTIFY_KUNIT_TEST
config LONGEST_SYM_KUNIT_TEST
tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS
- depends on KUNIT && KPROBES
+ depends on KUNIT && KPROBES && !PREFIX_SYMBOLS && !CFI_CLANG
default KUNIT_ALL_TESTS
help
Tests the longest symbol possible
diff --git a/lib/tests/longest_symbol_kunit.c b/lib/tests/longest_symbol_kunit.c
index e3c28ff1807f..b183fb92d1b2 100644
--- a/lib/tests/longest_symbol_kunit.c
+++ b/lib/tests/longest_symbol_kunit.c
@@ -3,8 +3,7 @@
* Test the longest symbol length. Execute with:
* ./tools/testing/kunit/kunit.py run longest-symbol
* --arch=x86_64 --kconfig_add CONFIG_KPROBES=y --kconfig_add CONFIG_MODULES=y
- * --kconfig_add CONFIG_RETPOLINE=n --kconfig_add CONFIG_CFI_CLANG=n
- * --kconfig_add CONFIG_MITIGATION_RETPOLINE=n
+ * --kconfig_add CONFIG_CPU_MITIGATIONS=n
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
base-commit: ebd297a2affadb6f6f4d2e5d975c1eda18ac762d
--
2.39.2
Until CONFIG_DMABUF_SYSFS_STATS was added [1] it was only possible to
perform per-buffer accounting with debugfs which is not suitable for
production environments. Eventually we discovered the overhead with
per-buffer sysfs file creation/removal was significantly impacting
allocation and free times, and exacerbated kernfs lock contention. [2]
dma_buf_stats_setup() is responsible for 39% of single-page buffer
creation duration, or 74% of single-page dma_buf_export() duration when
stressing dmabuf allocations and frees.
I prototyped a change from per-buffer to per-exporter statistics with a
RCU protected list of exporter allocations that accommodates most (but
not all) of our use-cases and avoids almost all of the sysfs overhead.
While that adds less overhead than per-buffer sysfs, and less even than
the maintenance of the dmabuf debugfs_list, it's still *additional*
overhead on top of the debugfs_list and doesn't give us per-buffer info.
This series uses the existing dmabuf debugfs_list to implement a BPF
dmabuf iterator, which adds no overhead to buffer allocation/free and
provides per-buffer info. The list has been moved outside of
CONFIG_DEBUG_FS scope so that it is always populated. The BPF program
loaded by userspace that extracts per-buffer information gets to define
its own interface which avoids the lack of ABI stability with debugfs.
This will allow us to replace our use of CONFIG_DMABUF_SYSFS_STATS, and
the plan is to remove it from the kernel after the next longterm stable
release.
[1] https://lore.kernel.org/linux-media/20201210044400.1080308-1-hridya@google.…
[2] https://lore.kernel.org/all/20220516171315.2400578-1-tjmercier@google.com
v1: https://lore.kernel.org/all/20250414225227.3642618-1-tjmercier@google.com
v1 -> v2:
Make the DMA buffer list independent of CONFIG_DEBUG_FS per Christian
König
Add CONFIG_DMA_SHARED_BUFFER check to kernel/bpf/Makefile per kernel
test robot
Use BTF_ID_LIST_SINGLE instead of BTF_ID_LIST_GLOBAL_SINGLE per Song Liu
Fixup comment style, mixing code/declarations, and use ASSERT_OK_FD in
selftest per Song Liu
Add BPF_ITER_RESCHED feature to bpf_dmabuf_reg_info per Alexei
Starovoitov
Add open-coded iterator and selftest per Alexei Starovoitov
Add a second test buffer from the system dmabuf heap to selftests
Use the BPF program we'll use in production for selftest per Alexei
Starovoitov
https://r.android.com/c/platform/system/bpfprogs/+/3616123/2/dmabufIter.chttps://r.android.com/c/platform/system/memory/libmeminfo/+/3614259/1/libdm…
v2: https://lore.kernel.org/all/20250504224149.1033867-1-tjmercier@google.com
v2 -> v3:
Rebase onto bpf-next/master
Move get_next_dmabuf() into drivers/dma-buf/dma-buf.c, along with the
new get_first_dmabuf(). This avoids having to expose the dmabuf list
and mutex to the rest of the kernel, and keeps the dmabuf mutex
operations near each other in the same file. (Christian König)
Add Christian's RB to dma-buf: Rename debugfs symbols
Drop RFC: dma-buf: Remove DMA-BUF statistics
v3: https://lore.kernel.org/all/20250507001036.2278781-1-tjmercier@google.com
v3 -> v4:
Fix selftest BPF program comment style (not kdoc) per Alexei Starovoitov
Fix dma-buf.c kdoc comment style per Alexei Starovoitov
Rename get_first_dmabuf / get_next_dmabuf to dma_buf_iter_begin /
dma_buf_iter_next per Christian König
Add Christian's RB to bpf: Add dmabuf iterator
v4: https://lore.kernel.org/all/20250508182025.2961555-1-tjmercier@google.com
v4 -> v5:
Add Christian's Acks to all patches
Add Song Liu's Acks
Move BTF_ID_LIST_SINGLE and DEFINE_BPF_ITER_FUNC closer to usage per
Song Liu
Fix open-coded iterator comment style per Song Liu
Move iterator termination check to its own subtest per Song Liu
Rework selftest buffer creation per Song Liu
Fix spacing in sanitize_string per BPF CI
T.J. Mercier (5):
dma-buf: Rename debugfs symbols
bpf: Add dmabuf iterator
bpf: Add open coded dmabuf iterator
selftests/bpf: Add test for dmabuf_iter
selftests/bpf: Add test for open coded dmabuf_iter
drivers/dma-buf/dma-buf.c | 98 +++++--
include/linux/dma-buf.h | 4 +-
kernel/bpf/Makefile | 3 +
kernel/bpf/dmabuf_iter.c | 150 ++++++++++
kernel/bpf/helpers.c | 5 +
.../testing/selftests/bpf/bpf_experimental.h | 5 +
tools/testing/selftests/bpf/config | 3 +
.../selftests/bpf/prog_tests/dmabuf_iter.c | 276 ++++++++++++++++++
.../testing/selftests/bpf/progs/dmabuf_iter.c | 91 ++++++
9 files changed, 613 insertions(+), 22 deletions(-)
create mode 100644 kernel/bpf/dmabuf_iter.c
create mode 100644 tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c
create mode 100644 tools/testing/selftests/bpf/progs/dmabuf_iter.c
base-commit: 43745d11bfd9683abdf08ad7a5cc403d6a9ffd15
--
2.49.0.1045.g170613ef41-goog
The SBI Firmware Feature extension allows the S-mode to request some
specific features (either hardware or software) to be enabled. This
series uses this extension to request misaligned access exception
delegation to S-mode in order to let the kernel handle it. It also adds
support for the KVM FWFT SBI extension based on the misaligned access
handling infrastructure.
FWFT SBI extension is part of the SBI V3.0 specifications [1]. It can be
tested using the qemu provided at [2] which contains the series from
[3]. Upstream kvm-unit-tests can be used inside kvm to tests the correct
delegation of misaligned exceptions. Upstream OpenSBI can be used.
Note: Since SBI V3.0 is not yet ratified, FWFT extension API is split
between interface only and implementation, allowing to pick only the
interface which do not have hard dependencies on SBI.
The tests can be run using the kselftest from series [4].
$ qemu-system-riscv64 \
-cpu rv64,trap-misaligned-access=true,v=true \
-M virt \
-m 1024M \
-bios fw_dynamic.bin \
-kernel Image
...
# ./misaligned
TAP version 13
1..23
# Starting 23 tests from 1 test cases.
# RUN global.gp_load_lh ...
# OK global.gp_load_lh
ok 1 global.gp_load_lh
# RUN global.gp_load_lhu ...
# OK global.gp_load_lhu
ok 2 global.gp_load_lhu
# RUN global.gp_load_lw ...
# OK global.gp_load_lw
ok 3 global.gp_load_lw
# RUN global.gp_load_lwu ...
# OK global.gp_load_lwu
ok 4 global.gp_load_lwu
# RUN global.gp_load_ld ...
# OK global.gp_load_ld
ok 5 global.gp_load_ld
# RUN global.gp_load_c_lw ...
# OK global.gp_load_c_lw
ok 6 global.gp_load_c_lw
# RUN global.gp_load_c_ld ...
# OK global.gp_load_c_ld
ok 7 global.gp_load_c_ld
# RUN global.gp_load_c_ldsp ...
# OK global.gp_load_c_ldsp
ok 8 global.gp_load_c_ldsp
# RUN global.gp_load_sh ...
# OK global.gp_load_sh
ok 9 global.gp_load_sh
# RUN global.gp_load_sw ...
# OK global.gp_load_sw
ok 10 global.gp_load_sw
# RUN global.gp_load_sd ...
# OK global.gp_load_sd
ok 11 global.gp_load_sd
# RUN global.gp_load_c_sw ...
# OK global.gp_load_c_sw
ok 12 global.gp_load_c_sw
# RUN global.gp_load_c_sd ...
# OK global.gp_load_c_sd
ok 13 global.gp_load_c_sd
# RUN global.gp_load_c_sdsp ...
# OK global.gp_load_c_sdsp
ok 14 global.gp_load_c_sdsp
# RUN global.fpu_load_flw ...
# OK global.fpu_load_flw
ok 15 global.fpu_load_flw
# RUN global.fpu_load_fld ...
# OK global.fpu_load_fld
ok 16 global.fpu_load_fld
# RUN global.fpu_load_c_fld ...
# OK global.fpu_load_c_fld
ok 17 global.fpu_load_c_fld
# RUN global.fpu_load_c_fldsp ...
# OK global.fpu_load_c_fldsp
ok 18 global.fpu_load_c_fldsp
# RUN global.fpu_store_fsw ...
# OK global.fpu_store_fsw
ok 19 global.fpu_store_fsw
# RUN global.fpu_store_fsd ...
# OK global.fpu_store_fsd
ok 20 global.fpu_store_fsd
# RUN global.fpu_store_c_fsd ...
# OK global.fpu_store_c_fsd
ok 21 global.fpu_store_c_fsd
# RUN global.fpu_store_c_fsdsp ...
# OK global.fpu_store_c_fsdsp
ok 22 global.fpu_store_c_fsdsp
# RUN global.gen_sigbus ...
[12797.988647] misaligned[618]: unhandled signal 7 code 0x1 at 0x0000000000014dc0 in misaligned[4dc0,10000+76000]
[12797.988990] CPU: 0 UID: 0 PID: 618 Comm: misaligned Not tainted 6.13.0-rc6-00008-g4ec4468967c9-dirty #51
[12797.989169] Hardware name: riscv-virtio,qemu (DT)
[12797.989264] epc : 0000000000014dc0 ra : 0000000000014d00 sp : 00007fffe165d100
[12797.989407] gp : 000000000008f6e8 tp : 0000000000095760 t0 : 0000000000000008
[12797.989544] t1 : 00000000000965d8 t2 : 000000000008e830 s0 : 00007fffe165d160
[12797.989692] s1 : 000000000000001a a0 : 0000000000000000 a1 : 0000000000000002
[12797.989831] a2 : 0000000000000000 a3 : 0000000000000000 a4 : ffffffffdeadbeef
[12797.989964] a5 : 000000000008ef61 a6 : 626769735f6e0000 a7 : fffffffffffff000
[12797.990094] s2 : 0000000000000001 s3 : 00007fffe165d838 s4 : 00007fffe165d848
[12797.990238] s5 : 000000000000001a s6 : 0000000000010442 s7 : 0000000000010200
[12797.990391] s8 : 000000000000003a s9 : 0000000000094508 s10: 0000000000000000
[12797.990526] s11: 0000555567460668 t3 : 00007fffe165d070 t4 : 00000000000965d0
[12797.990656] t5 : fefefefefefefeff t6 : 0000000000000073
[12797.990756] status: 0000000200004020 badaddr: 000000000008ef61 cause: 0000000000000006
[12797.990911] Code: 8793 8791 3423 fcf4 3783 fc84 c737 dead 0713 eef7 (c398) 0001
# OK global.gen_sigbus
ok 23 global.gen_sigbus
# PASSED: 23 / 23 tests passed.
# Totals: pass:23 fail:0 xfail:0 xpass:0 skip:0 error:0
With kvm-tools:
# lkvm run -k sbi.flat -m 128
Info: # lkvm run -k sbi.flat -m 128 -c 1 --name guest-97
Info: Removed ghost socket file "/root/.lkvm//guest-97.sock".
##########################################################################
# kvm-unit-tests
##########################################################################
... [test messages elided]
PASS: sbi: fwft: FWFT extension probing no error
PASS: sbi: fwft: get/set reserved feature 0x6 error == SBI_ERR_DENIED
PASS: sbi: fwft: get/set reserved feature 0x3fffffff error == SBI_ERR_DENIED
PASS: sbi: fwft: get/set reserved feature 0x80000000 error == SBI_ERR_DENIED
PASS: sbi: fwft: get/set reserved feature 0xbfffffff error == SBI_ERR_DENIED
PASS: sbi: fwft: misaligned_deleg: Get misaligned deleg feature no error
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature invalid value error
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature invalid value error
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature value no error
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature value 0
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature value no error
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature value 1
PASS: sbi: fwft: misaligned_deleg: Verify misaligned load exception trap in supervisor
SUMMARY: 50 tests, 2 unexpected failures, 12 skipped
This series is available at [5].
Link: https://github.com/riscv-non-isa/riscv-sbi-doc/releases/download/vv3.0-rc2/… [1]
Link: https://github.com/rivosinc/qemu/tree/dev/cleger/misaligned [2]
Link: https://lore.kernel.org/all/20241211211933.198792-3-fkonrad@amd.com/T/ [3]
Link: https://lore.kernel.org/linux-riscv/20250414123543.1615478-1-cleger@rivosin… [4]
Link: https://github.com/rivosinc/linux/tree/dev/cleger/fwft [5]
---
V6:
- Rename FWFT interface to remove "_local"
- Fix test for MEDELEG values in KVM FWFT support
- Add __init for unaligned_access_init()
- Rebased on master
V5:
- Return ERANGE as mapping for SBI_ERR_BAD_RANGE
- Removed unused sbi_fwft_get()
- Fix kernel for sbi_fwft_local_set_cpumask()
- Fix indentation for sbi_fwft_local_set()
- Remove spurious space in kvm_sbi_fwft_ops.
- Rebased on origin/master
- Remove fixes commits and sent them as a separate series [4]
V4:
- Check SBI version 3.0 instead of 2.0 for FWFT presence
- Use long for kvm_sbi_fwft operation return value
- Init KVM sbi extension even if default_disabled
- Remove revert_on_fail parameter for sbi_fwft_feature_set().
- Fix comments for sbi_fwft_set/get()
- Only handle local features (there are no globals yet in the spec)
- Add new SBI errors to sbi_err_map_linux_errno()
V3:
- Added comment about kvm sbi fwft supported/set/get callback
requirements
- Move struct kvm_sbi_fwft_feature in kvm_sbi_fwft.c
- Add a FWFT interface
V2:
- Added Kselftest for misaligned testing
- Added get_user() usage instead of __get_user()
- Reenable interrupt when possible in misaligned access handling
- Document that riscv supports unaligned-traps
- Fix KVM extension state when an init function is present
- Rework SBI misaligned accesses trap delegation code
- Added support for CPU hotplugging
- Added KVM SBI reset callback
- Added reset for KVM SBI FWFT lock
- Return SBI_ERR_DENIED_LOCKED when LOCK flag is set
Clément Léger (14):
riscv: sbi: add Firmware Feature (FWFT) SBI extensions definitions
riscv: sbi: remove useless parenthesis
riscv: sbi: add new SBI error mappings
riscv: sbi: add FWFT extension interface
riscv: sbi: add SBI FWFT extension calls
riscv: misaligned: request misaligned exception from SBI
riscv: misaligned: use on_each_cpu() for scalar misaligned access
probing
riscv: misaligned: use correct CONFIG_ ifdef for
misaligned_access_speed
riscv: misaligned: move emulated access uniformity check in a function
riscv: misaligned: add a function to check misalign trap delegability
RISC-V: KVM: add SBI extension init()/deinit() functions
RISC-V: KVM: add SBI extension reset callback
RISC-V: KVM: add support for FWFT SBI extension
RISC-V: KVM: add support for SBI_FWFT_MISALIGNED_DELEG
arch/riscv/include/asm/cpufeature.h | 8 +-
arch/riscv/include/asm/kvm_host.h | 5 +-
arch/riscv/include/asm/kvm_vcpu_sbi.h | 12 +
arch/riscv/include/asm/kvm_vcpu_sbi_fwft.h | 29 +++
arch/riscv/include/asm/sbi.h | 60 +++++
arch/riscv/include/uapi/asm/kvm.h | 1 +
arch/riscv/kernel/sbi.c | 81 ++++++-
arch/riscv/kernel/traps_misaligned.c | 110 ++++++++-
arch/riscv/kernel/unaligned_access_speed.c | 8 +-
arch/riscv/kvm/Makefile | 1 +
arch/riscv/kvm/vcpu.c | 7 +-
arch/riscv/kvm/vcpu_sbi.c | 54 +++++
arch/riscv/kvm/vcpu_sbi_fwft.c | 252 +++++++++++++++++++++
arch/riscv/kvm/vcpu_sbi_sta.c | 3 +-
14 files changed, 613 insertions(+), 18 deletions(-)
create mode 100644 arch/riscv/include/asm/kvm_vcpu_sbi_fwft.h
create mode 100644 arch/riscv/kvm/vcpu_sbi_fwft.c
--
2.49.0
hi,
We can use get_func_[arg|arg_cnt] helpers in fentry/fexit/fmod_ret programs
currently[1]. But they can't be used in raw_tp/tp_btf programs.
Adding support to use get_func_[arg|arg_cnt] helpers in raw_tp/tp_btf
programs.
Adding BPF_PROG_TEST_RUN for tp_btf.
Add selftests to check them.
Thanks,
KaFai
[1] https://lore.kernel.org/bpf/20211208193245.172141-1-jolsa@kernel.org/
---
KaFai Wan (4):
bpf: Allow get_func_[arg|arg_cnt] helpers in raw tracepoint programs
bpf: Enable BPF_PROG_TEST_RUN for tp_btf
selftests/bpf: Add raw_tp_test_run for tp_btf
selftests/bpf: Add tests for get_func_[arg|arg_cnt] helpers in raw
tracepoint programs
kernel/trace/bpf_trace.c | 17 +++++--
net/bpf/test_run.c | 16 +++----
.../bpf/prog_tests/raw_tp_get_func_args.c | 48 +++++++++++++++++++
.../bpf/prog_tests/raw_tp_test_run.c | 18 ++++++-
.../bpf/progs/test_raw_tp_get_func_args.c | 47 ++++++++++++++++++
.../bpf/progs/test_raw_tp_test_run.c | 16 +++++--
6 files changed, 146 insertions(+), 16 deletions(-)
create mode 100644 tools/testing/selftests/bpf/prog_tests/raw_tp_get_func_args.c
create mode 100644 tools/testing/selftests/bpf/progs/test_raw_tp_get_func_args.c
--
2.43.0
This series introduces a new VIOMMU infrastructure and related ioctls.
IOMMUFD has been using the HWPT infrastructure for all cases, including a
nested IO page table support. Yet, there're limitations for an HWPT-based
structure to support some advanced HW-accelerated features, such as CMDQV
on NVIDIA Grace, and HW-accelerated vIOMMU on AMD. Even for a multi-IOMMU
environment, it is not straightforward for nested HWPTs to share the same
parent HWPT (stage-2 IO pagetable), with the HWPT infrastructure alone.
The new VIOMMU object is an additional layer, between the nested HWPT and
its parent HWPT, to give to both the IOMMUFD core and an IOMMU driver an
additional structure to support HW-accelerated feature:
----------------------------
---------------- | | paging_hwpt0 |
| hwpt_nested0 |--->| viommu0 ------------------
---------------- | | HW-accel feats |
----------------------------
On a multi-IOMMU system, the VIOMMU object can be instanced to the number
of vIOMMUs in a guest VM, while holding the same parent HWPT to share the
stage-2 IO pagetable. Each VIOMMU then just need to only allocate its own
VMID to attach the shared stage-2 IO pagetable to the physical IOMMU:
----------------------------
---------------- | | paging_hwpt0 |
| hwpt_nested0 |--->| viommu0 ------------------
---------------- | | VMID0 |
----------------------------
----------------------------
---------------- | | paging_hwpt0 |
| hwpt_nested1 |--->| viommu1 ------------------
---------------- | | VMID1 |
----------------------------
As an initial part-1, add ioctls to support a VIOMMU-based invalidation:
IOMMUFD_CMD_VIOMMU_ALLOC to allocate a VIOMMU object
IOMMUFD_CMD_VIOMMU_SET/UNSET_VDEV_ID to set/clear device's virtual ID
(Resue IOMMUFD_CMD_HWPT_INVALIDATE for a VIOMMU object to flush cache
by a given driver data)
Worth noting that the VDEV_ID is for a per-VIOMMU device list for drivers
to look up the device's physical instance from its virtual ID in a VM. It
is essential for a VIOMMU-based invalidation where the request contains a
device's virtual ID for its device cache flush, e.g. ATC invalidation.
As for the implementation of the series, add an IOMMU_VIOMMU_TYPE_DEFAULT
type for a core-allocated-core-managed VIOMMU object, allowing drivers to
simply hook a default viommu ops for viommu-based invalidation alone. And
provide some viommu helpers to drivers for VDEV_ID translation and parent
domain lookup. Add VIOMMU invalidation support to ARM SMMUv3 driver for a
real world use case. This adds supports of arm-smmuv-v3's CMDQ_OP_ATC_INV
and CMDQ_OP_CFGI_CD/ALL commands, supplementing HWPT-based invalidations.
In the future, drivers will also be able to choose a driver-managed type
to hold its own structure by adding a new type to enum iommu_viommu_type.
More VIOMMU-based structures and ioctls will be introduced in part-2/3 to
support a driver-managed VIOMMU, e.g. VQUEUE object for a HW accelerated
queue, VIRQ (or VEVENT) object for IRQ injections. Although we repurposed
the VIOMMU object from an earlier RFC discussion, for a referece:
https://lore.kernel.org/all/cover.1712978212.git.nicolinc@nvidia.com/
This series is on Github:
https://github.com/nicolinc/iommufd/commits/iommufd_viommu_p1-v2
Paring QEMU branch for testing:
https://github.com/nicolinc/qemu/commits/wip/for_iommufd_viommu_p1-v2
Changelog
v2
* Limited vdev_id to one per idev
* Added a rw_sem to protect the vdev_id list
* Reworked driver-level APIs with proper lockings
* Added a new viommu_api file for IOMMUFD_DRIVER config
* Dropped useless iommu_dev point from the viommu structure
* Added missing index numnbers to new types in the uAPI header
* Dropped IOMMU_VIOMMU_INVALIDATE uAPI; Instead, reuse the HWPT one
* Reworked mock_viommu_cache_invalidate() using the new iommu helper
* Reordered details of set/unset_vdev_id handlers for proper lockings
* Added arm_smmu_cache_invalidate_user patch from Jason's nesting series
v1
https://lore.kernel.org/all/cover.1723061377.git.nicolinc@nvidia.com/
Thanks!
Nicolin
Jason Gunthorpe (3):
iommu: Add iommu_copy_struct_from_full_user_array helper
iommu/arm-smmu-v3: Allow ATS for IOMMU_DOMAIN_NESTED
iommu/arm-smmu-v3: Update comments about ATS and bypass
Nicolin Chen (16):
iommufd: Reorder struct forward declarations
iommufd/viommu: Add IOMMUFD_OBJ_VIOMMU and IOMMU_VIOMMU_ALLOC ioctl
iommu: Pass in a viommu pointer to domain_alloc_user op
iommufd: Allow pt_id to carry viommu_id for IOMMU_HWPT_ALLOC
iommufd/selftest: Add IOMMU_VIOMMU_ALLOC test coverage
iommufd/viommu: Add IOMMU_VIOMMU_SET/UNSET_VDEV_ID ioctl
iommufd/selftest: Add IOMMU_VIOMMU_SET/UNSET_VDEV_ID test coverage
iommufd/viommu: Add cache_invalidate for IOMMU_VIOMMU_TYPE_DEFAULT
iommufd: Allow hwpt_id to carry viommu_id for IOMMU_HWPT_INVALIDATE
iommufd/viommu: Add vdev_id helpers for IOMMU drivers
iommufd/selftest: Add mock_viommu_invalidate_user op
iommufd/selftest: Add IOMMU_TEST_OP_DEV_CHECK_CACHE test command
iommufd/selftest: Add VIOMMU coverage for IOMMU_HWPT_INVALIDATE ioctl
iommufd/viommu: Add iommufd_viommu_to_parent_domain helper
iommu/arm-smmu-v3: Add arm_smmu_cache_invalidate_user
iommu/arm-smmu-v3: Add arm_smmu_viommu_cache_invalidate
drivers/iommu/amd/iommu.c | 1 +
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 218 ++++++++++++++-
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 3 +
drivers/iommu/intel/iommu.c | 1 +
drivers/iommu/iommufd/Makefile | 5 +-
drivers/iommu/iommufd/device.c | 12 +
drivers/iommu/iommufd/hw_pagetable.c | 59 +++-
drivers/iommu/iommufd/iommufd_private.h | 37 +++
drivers/iommu/iommufd/iommufd_test.h | 30 ++
drivers/iommu/iommufd/main.c | 12 +
drivers/iommu/iommufd/selftest.c | 101 ++++++-
drivers/iommu/iommufd/viommu.c | 196 +++++++++++++
drivers/iommu/iommufd/viommu_api.c | 53 ++++
include/linux/iommu.h | 56 +++-
include/linux/iommufd.h | 51 +++-
include/uapi/linux/iommufd.h | 117 +++++++-
tools/testing/selftests/iommu/iommufd.c | 259 +++++++++++++++++-
tools/testing/selftests/iommu/iommufd_utils.h | 126 +++++++++
18 files changed, 1299 insertions(+), 38 deletions(-)
create mode 100644 drivers/iommu/iommufd/viommu.c
create mode 100644 drivers/iommu/iommufd/viommu_api.c
--
2.43.0
This series fixes misaligned access handling when in non interruptible
context by reenabling interrupts when possible. A previous commit
changed raw_copy_from_user() with copy_from_user() which enables page
faulting and thus can sleep. While correct, a warning is now triggered
due to being called in an invalid context (sleeping in
non-interruptible). This series fixes that problem by factorizing
misaligned load/store entry in a single function than reenables
interrupt if the interrupted context had interrupts enabled.
In order for misaligned handling problems to be caught sooner, add a
kselftest for all the currently supported instructions .
Note: these commits were actually part of another larger series for
misaligned request delegation but was split since it isn't directly
required.
V2:
- Use an array of struct to simplify misaligned load/store selection
- Revert use of irqentry_enter()/exit() to irqentry_nmi_enter() for
kernel space.
Clément Léger (5):
riscv: misaligned: factorize trap handling
riscv: misaligned: enable IRQs while handling misaligned accesses
riscv: misaligned: use get_user() instead of __get_user()
Documentation/sysctl: add riscv to unaligned-trap supported archs
selftests: riscv: add misaligned access testing
Documentation/admin-guide/sysctl/kernel.rst | 4 +-
arch/riscv/kernel/traps.c | 64 +++--
arch/riscv/kernel/traps_misaligned.c | 2 +-
.../selftests/riscv/misaligned/.gitignore | 1 +
.../selftests/riscv/misaligned/Makefile | 12 +
.../selftests/riscv/misaligned/common.S | 33 +++
.../testing/selftests/riscv/misaligned/fpu.S | 180 +++++++++++++
tools/testing/selftests/riscv/misaligned/gp.S | 103 +++++++
.../selftests/riscv/misaligned/misaligned.c | 254 ++++++++++++++++++
9 files changed, 623 insertions(+), 30 deletions(-)
create mode 100644 tools/testing/selftests/riscv/misaligned/.gitignore
create mode 100644 tools/testing/selftests/riscv/misaligned/Makefile
create mode 100644 tools/testing/selftests/riscv/misaligned/common.S
create mode 100644 tools/testing/selftests/riscv/misaligned/fpu.S
create mode 100644 tools/testing/selftests/riscv/misaligned/gp.S
create mode 100644 tools/testing/selftests/riscv/misaligned/misaligned.c
--
2.49.0
┌────────────┐ ┌───────────────────────────────────┐ ┌────────────────┐
│ │ │ │ │ │
│ │ │ PCI Endpoint │ │ PCI Host │
│ │ │ │ │ │
│ │◄──┤ 1.platform_msi_domain_alloc_irqs()│ │ │
│ │ │ │ │ │
│ MSI ├──►│ 2.write_msi_msg() ├──►├─BAR<n> │
│ Controller │ │ update doorbell register address│ │ │
│ │ │ for BAR │ │ │
│ │ │ │ │ 3. Write BAR<n>│
│ │◄──┼───────────────────────────────────┼───┤ │
│ │ │ │ │ │
│ ├──►│ 4.Irq Handle │ │ │
│ │ │ │ │ │
│ │ │ │ │ │
└────────────┘ └───────────────────────────────────┘ └────────────────┘
This patches based on old https://lore.kernel.org/imx/20221124055036.1630573-1-Frank.Li@nxp.com/
Original patch only target to vntb driver. But actually it is common
method.
This patches add new API to pci-epf-core, so any EP driver can use it.
Previous v2 discussion here.
https://lore.kernel.org/imx/20230911220920.1817033-1-Frank.Li@nxp.com/
Changes in v18:
- pci-ep.yaml: sort property order, fix maxvalue to 0x7ffff for msi-map-mask and
iommu-map-mask
- Link to v17: https://lore.kernel.org/r/20250407-ep-msi-v17-0-633ab45a31d0@nxp.com
Changes in v17:
- move document part to pci-ep.yaml
- Link to v16: https://lore.kernel.org/r/20250404-ep-msi-v16-0-d4919d68c0d0@nxp.com
Changes in v16:
- remove arm64: dts: imx95-19x19-evk: Add PCIe1 endpoint function overlay file
because there are better patches, which under review.
- Add document for pcie-ep msi-map usage
- other change to see each patch's change log
About IMMUTABLE (No change for this part, tglx provide feedback)
> - This IMMUTABLE thing serves no purpose, because you don't randomly
> plug this end-point block on any MSI controller. They come as part
> of an SoC.
"Yes and no. The problem is that the EP implementation is meant to be a
generic library and while GIC-ITS guarantees immutability of the
address/data pair after setup, there are architectures (x86, loongson,
riscv) where the base MSI controller does not and immutability is only
achieved when interrupt remapping is enabled. The latter can be disabled
at boot-time and then the EP implementation becomes a lottery across
affinity changes.
That was my concern about this library implementation and that's why I
asked for a mechanism to ensure that the underlying irqdomain provides a
immutable address/data pair.
So it does not matter for GIC-ITS, but in the larger picture it matters.
Thanks,
tglx
"
So it does not matter for GIC-ITS, but in the larger picture it matters.
- Link to v15: https://lore.kernel.org/r/20250211-ep-msi-v15-0-bcacc1f2b1a9@nxp.com
Changes in v15:
- rebase to v6.14-rc1
- fix build issue find by kernel test robot
- Link to v14: https://lore.kernel.org/r/20250207-ep-msi-v14-0-9671b136f2b8@nxp.com
Changes in v14:
Marc Zyngier raised concerns about adding DOMAIN_BUS_DEVICE_PCI_EP_MSI. As
a result, the approach has been reverted to the v9 method. However, there
are several improvements:
MSI now supports msi-map in addition to msi-parent.
- The struct device: id is used as the endpoint function (EPF) device
identity to map to the stream ID (sideband information).
- The EPC device tree source (DTS) utilizes msi-map to provide such
information.
- The EPF device's of_node is set to the EPC controller’s node. This
approach is commonly used for multi-function device (MFD) platform child
devices, allowing them to inherit properties from the MFD device’s DTS,
such as reset-cells and gpio-cells. This method is well-suited for the
current case, as the EPF is inherently created/binded to the EPC and
should inherit the EPC’s DTS node properties.
Additionally:
Since the basic IMX95 LUT support has already been merged into the
mainline, a DTS and driver increment patch is added to complete the
solution. The patch is rebased onto the latest linux-next tree and
aligned with the new pcitest framework.
- Link to v13: https://lore.kernel.org/r/20241218-ep-msi-v13-0-646e2192dc24@nxp.com
Changes in v13:
- Change to use DOMAIN_BUS_PCI_DEVICE_EP_MSI
- Change request id as func | vfunc << 3
- Remove IRQ_DOMAIN_MSI_IMMUTABLE
Thomas Gleixner:
I hope capture all your points in review comments. If missed, let me know.
- Link to v12: https://lore.kernel.org/r/20241211-ep-msi-v12-0-33d4532fa520@nxp.com
Changes in v12:
- Change to use IRQ_DOMAIN_MSI_IMMUTABLE and add help function
irq_domain_msi_is_immuatble().
- split PCI: endpoint: pci-ep-msi: Add MSI address/data pair mutable check to 3 patches
- Link to v11: https://lore.kernel.org/r/20241209-ep-msi-v11-0-7434fa8397bd@nxp.com
Changes in v11:
- Change to use MSI_FLAG_MSG_IMMUTABLE
- Link to v10: https://lore.kernel.org/r/20241204-ep-msi-v10-0-87c378dbcd6d@nxp.com
Changes in v10:
Thomas Gleixner:
There are big change in pci-ep-msi.c. I am sure if go on the
corrent path. The key improvement is remove only 1 function devices's
limitation.
I use new patch for imutable check, which relative additional
feature compared to base enablement patch.
- Remove patch Add msi_remove_device_irq_domain() in platform_device_msi_free_irqs_all()
- Add new patch irqchip/gic-v3-its: Avoid overwriting msi_prepare callback if provided by msi_domain_info
- Remove only support 1 endpoint function limiation.
- Create one MSI domain for each endpoint function devices.
- Use "msi-map" in pci ep controler node, instead of of msi-parent. first
argument is
(func_no << 8 | vfunc_no)
- Link to v9: https://lore.kernel.org/r/20241203-ep-msi-v9-0-a60dbc3f15dd@nxp.com
Changes in v9
- Add patch platform-msi: Add msi_remove_device_irq_domain() in platform_device_msi_free_irqs_all()
- Remove patch PCI: endpoint: Add pci_epc_get_fn() API for customizable filtering
- Remove API pci_epf_align_inbound_addr_lo_hi
- Move doorbell_alloc in to doorbell_enable function.
- Link to v8: https://lore.kernel.org/r/20241116-ep-msi-v8-0-6f1f68ffd1bb@nxp.com
Changes in v8:
- update helper function name to pci_epf_align_inbound_addr()
- Link to v7: https://lore.kernel.org/r/20241114-ep-msi-v7-0-d4ac7aafbd2c@nxp.com
Changes in v7:
- Add helper function pci_epf_align_addr();
- Link to v6: https://lore.kernel.org/r/20241112-ep-msi-v6-0-45f9722e3c2a@nxp.com
Changes in v6:
- change doorbell_addr to doorbell_offset
- use round_down()
- add Niklas's test by tag
- rebase to pci/endpoint
- Link to v5: https://lore.kernel.org/r/20241108-ep-msi-v5-0-a14951c0d007@nxp.com
Changes in v5:
- Move request_irq to epf test function driver for more flexiable user case
- Add fixed size bar handler
- Some minor improvememtn to see each patches's changelog.
- Link to v4: https://lore.kernel.org/r/20241031-ep-msi-v4-0-717da2d99b28@nxp.com
Changes in v4:
- Remove patch genirq/msi: Add cleanup guard define for msi_lock_descs()/msi_unlock_descs()
- Use new method to avoid compatible problem.
Add new command DOORBELL_ENABLE and DOORBELL_DISABLE.
pcitest -B send DOORBELL_ENABLE first, EP test function driver try to
remap one of BAR_N (except test register bar) to ITS MSI MMIO space. Old
driver don't support new command, so failure return, not side effect.
After test, DOORBELL_DISABLE command send out to recover original map, so
pcitest bar test can pass as normal.
- Other detail change see each patches's change log
- Link to v3: https://lore.kernel.org/r/20241015-ep-msi-v3-0-cedc89a16c1a@nxp.com
Change from v2 to v3
- Fixed manivannan's comments
- Move common part to pci-ep-msi.c and pci-ep-msi.h
- rebase to 6.12-rc1
- use RevID to distingiush old version
mkdir /sys/kernel/config/pci_ep/functions/pci_epf_test/func1
echo 16 > /sys/kernel/config/pci_ep/functions/pci_epf_test/func1/msi_interrupts
echo 0x080c > /sys/kernel/config/pci_ep/functions/pci_epf_test/func1/deviceid
echo 0x1957 > /sys/kernel/config/pci_ep/functions/pci_epf_test/func1/vendorid
echo 1 > /sys/kernel/config/pci_ep/functions/pci_epf_test/func1/revid
^^^^^^ to enable platform msi support.
ln -s /sys/kernel/config/pci_ep/functions/pci_epf_test/func1 /sys/kernel/config/pci_ep/controllers/4c380000.pcie-ep
- use new device ID, which identify support doorbell to avoid broken
compatility.
Enable doorbell support only for PCI_DEVICE_ID_IMX8_DB, while other devices
keep the same behavior as before.
EP side RC with old driver RC with new driver
PCI_DEVICE_ID_IMX8_DB no probe doorbell enabled
Other device ID doorbell disabled* doorbell disabled*
* Behavior remains unchanged.
Change from v1 to v2
- Add missed patch for endpont/pci-epf-test.c
- Move alloc and free to epc driver from epf.
- Provide general help function for EPC driver to alloc platform msi irq.
- Fixed manivannan's comments.
Signed-off-by: Frank Li <Frank.Li(a)nxp.com>
---
Frank Li (15):
platform-msi: Add msi_remove_device_irq_domain() in platform_device_msi_free_irqs_all()
irqdomain: Add IRQ_DOMAIN_FLAG_MSI_IMMUTABLE and irq_domain_is_msi_immutable()
irqchip/gic-v3-its: Set IRQ_DOMAIN_FLAG_MSI_IMMUTABLE for ITS
dt-bindings: PCI: pci-ep: Add support for iommu-map and msi-map
irqchip/gic-v3-its: Add support for device tree msi-map and msi-mask
PCI: endpoint: Set ID and of_node for function driver
PCI: endpoint: Add RC-to-EP doorbell support using platform MSI controller
PCI: endpoint: pci-ep-msi: Add MSI address/data pair mutable check
PCI: endpoint: Add pci_epf_align_inbound_addr() helper for address alignment
PCI: endpoint: pci-epf-test: Add doorbell test support
misc: pci_endpoint_test: Add doorbell test case
selftests: pci_endpoint: Add doorbell test case
pci: imx6: Add helper function imx_pcie_add_lut_by_rid()
pci: imx6: Add LUT setting for MSI/IOMMU in Endpoint mode
arm64: dts: imx95: Add msi-map for pci-ep device
Documentation/devicetree/bindings/pci/pci-ep.yaml | 68 ++++++++++
arch/arm64/boot/dts/freescale/imx95.dtsi | 1 +
drivers/base/platform-msi.c | 1 +
drivers/irqchip/irq-gic-v3-its-msi-parent.c | 8 ++
drivers/irqchip/irq-gic-v3-its.c | 2 +-
drivers/misc/pci_endpoint_test.c | 82 ++++++++++++
drivers/pci/controller/dwc/pci-imx6.c | 25 ++--
drivers/pci/endpoint/Makefile | 1 +
drivers/pci/endpoint/functions/pci-epf-test.c | 142 +++++++++++++++++++++
drivers/pci/endpoint/pci-ep-msi.c | 90 +++++++++++++
drivers/pci/endpoint/pci-epf-core.c | 48 +++++++
include/linux/irqdomain.h | 7 +
include/linux/pci-ep-msi.h | 28 ++++
include/linux/pci-epf.h | 21 +++
include/uapi/linux/pcitest.h | 1 +
.../selftests/pci_endpoint/pci_endpoint_test.c | 28 ++++
16 files changed, 544 insertions(+), 9 deletions(-)
---
base-commit: a4949bd40778aa9beac77c89e4c6a1da52875c8b
change-id: 20241010-ep-msi-8b4cab33b1be
Best regards,
---
Frank Li <Frank.Li(a)nxp.com>
This patch set aims to allow ublk server threads to better balance load
amongst themselves by decoupling server threads from ublk queues/hctxs,
so that multiple threads can service I/Os from a single hctx.
The first patch is the functional change in the driver which switches
from per-queue daemons to per-io daemons and allows for ublk servers to
balance load that is imbalanced among queues. The second patch fixes a
bug in tag allocation (in the sbitmap layer) that was observed while
developing a test for this feature. The next five patches add support in
the selftests ublk server (kublk) for this feature, and add a test which
shows the new feature working as intended. The last patch documents the
new feature.
Signed-off-by: Uday Shankar <ushankar(a)purestorage.com>
---
Changes in v6:
- Add a feature flag for this feature, called UBLK_F_RR_TAGS (Ming Lei)
- Add test for this feature (Ming Lei)
- Add documentation for this feature (Ming Lei)
- Link to v5: https://lore.kernel.org/r/20250416-ublk_task_per_io-v5-0-9261ad7bff20@pures…
Changes in v5:
- Set io->task before ublk_mark_io_ready (Caleb Sander Mateos)
- Set io->task atomically, read it atomically when needed
- Return 0 on success from command-specific helpers in
__ublk_ch_uring_cmd (Caleb Sander Mateos)
- Rename ublk_handle_need_get_data to ublk_get_data (Caleb Sander
Mateos)
- Link to v4: https://lore.kernel.org/r/20250415-ublk_task_per_io-v4-0-54210b91a46f@pures…
Changes in v4:
- Drop "ublk: properly serialize all FETCH_REQs" since Ming is taking it
in another set
- Prevent data races by marking data structures which should be
read-only in the I/O path as const (Ming Lei)
- Link to v3: https://lore.kernel.org/r/20250410-ublk_task_per_io-v3-0-b811e8f4554a@pures…
Changes in v3:
- Check for UBLK_IO_FLAG_ACTIVE on I/O again after taking lock to ensure
that two concurrent FETCH_REQs on the same I/O can't succeed (Caleb
Sander Mateos)
- Link to v2: https://lore.kernel.org/r/20250408-ublk_task_per_io-v2-0-b97877e6fd50@pures…
Changes in v2:
- Remove changes split into other patches
- To ease error handling/synchronization, associate each I/O (instead of
each queue) to the last task that issues a FETCH_REQ against it. Only
that task is allowed to operate on the I/O.
- Link to v1: https://lore.kernel.org/r/20241002224437.3088981-1-ushankar@purestorage.com
---
Uday Shankar (8):
ublk: have a per-io daemon instead of a per-queue daemon
sbitmap: fix off-by-one when wrapping hint
selftests: ublk: kublk: plumb q_id in io_uring user_data
selftests: ublk: kublk: tie sqe allocation to io instead of queue
selftests: ublk: kublk: lift queue initialization out of thread
selftests: ublk: kublk: move per-thread data out of ublk_queue
selftests: ublk: kublk: decouple ublk_queues from ublk server threads
Documentation: ublk: document UBLK_F_RR_TAGS
Documentation/block/ublk.rst | 83 +++++-
drivers/block/ublk_drv.c | 82 ++---
include/uapi/linux/ublk_cmd.h | 8 +
lib/sbitmap.c | 4 +-
tools/testing/selftests/ublk/Makefile | 1 +
tools/testing/selftests/ublk/fault_inject.c | 4 +-
tools/testing/selftests/ublk/file_backed.c | 20 +-
tools/testing/selftests/ublk/kublk.c | 329 ++++++++++++++-------
tools/testing/selftests/ublk/kublk.h | 73 +++--
tools/testing/selftests/ublk/null.c | 12 +-
tools/testing/selftests/ublk/stripe.c | 17 +-
tools/testing/selftests/ublk/test_generic_08.sh | 61 ++++
.../selftests/ublk/trace/count_ios_per_tid.bt | 9 +
13 files changed, 488 insertions(+), 215 deletions(-)
---
base-commit: 037af793557ed192b2c10cf2379ac97abacedf55
change-id: 20250408-ublk_task_per_io-c693cf608d7a
Best regards,
--
Uday Shankar <ushankar(a)purestorage.com>
Some small fixes for arch_timer_edge_cases that I stumbled upon
while debugging failures for this selftest on ampere-one.
Sebastian Ott (3):
KVM: arm64: selftests: fix help text for arch_timer_edge_cases
KVM: arm64: selftests: fix thread migration in arch_timer_edge_cases
KVM: arm64: selftests: arch_timer_edge_cases - workaround for
AC03_CPU_14
.../selftests/kvm/arm64/arch_timer_edge_cases.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
base-commit: 9c69f88849045499e8ad114e5e13dbb3c85f4443
--
2.49.0
Until CONFIG_DMABUF_SYSFS_STATS was added [1] it was only possible to
perform per-buffer accounting with debugfs which is not suitable for
production environments. Eventually we discovered the overhead with
per-buffer sysfs file creation/removal was significantly impacting
allocation and free times, and exacerbated kernfs lock contention. [2]
dma_buf_stats_setup() is responsible for 39% of single-page buffer
creation duration, or 74% of single-page dma_buf_export() duration when
stressing dmabuf allocations and frees.
I prototyped a change from per-buffer to per-exporter statistics with a
RCU protected list of exporter allocations that accommodates most (but
not all) of our use-cases and avoids almost all of the sysfs overhead.
While that adds less overhead than per-buffer sysfs, and less even than
the maintenance of the dmabuf debugfs_list, it's still *additional*
overhead on top of the debugfs_list and doesn't give us per-buffer info.
This series uses the existing dmabuf debugfs_list to implement a BPF
dmabuf iterator, which adds no overhead to buffer allocation/free and
provides per-buffer info. The list has been moved outside of
CONFIG_DEBUG_FS scope so that it is always populated. The BPF program
loaded by userspace that extracts per-buffer information gets to define
its own interface which avoids the lack of ABI stability with debugfs.
This will allow us to replace our use of CONFIG_DMABUF_SYSFS_STATS, and
the plan is to remove it from the kernel after the next longterm stable
release.
[1] https://lore.kernel.org/linux-media/20201210044400.1080308-1-hridya@google.…
[2] https://lore.kernel.org/all/20220516171315.2400578-1-tjmercier@google.com
v1: https://lore.kernel.org/all/20250414225227.3642618-1-tjmercier@google.com
v1 -> v2:
Make the DMA buffer list independent of CONFIG_DEBUG_FS per Christian König
Add CONFIG_DMA_SHARED_BUFFER check to kernel/bpf/Makefile per kernel test robot
Use BTF_ID_LIST_SINGLE instead of BTF_ID_LIST_GLOBAL_SINGLE per Song Liu
Fixup comment style, mixing code/declarations, and use ASSERT_OK_FD in selftest per Song Liu
Add BPF_ITER_RESCHED feature to bpf_dmabuf_reg_info per Alexei Starovoitov
Add open-coded iterator and selftest per Alexei Starovoitov
Add a second test buffer from the system dmabuf heap to selftests
Use the BPF program we'll use in production for selftest per Alexei Starovoitov
https://r.android.com/c/platform/system/bpfprogs/+/3616123/2/dmabufIter.chttps://r.android.com/c/platform/system/memory/libmeminfo/+/3614259/1/libdm…
v2: https://lore.kernel.org/all/20250504224149.1033867-1-tjmercier@google.com
v2 -> v3:
Rebase onto bpf-next/master
Move get_next_dmabuf() into drivers/dma-buf/dma-buf.c, along with the
new get_first_dmabuf(). This avoids having to expose the dmabuf list
and mutex to the rest of the kernel, and keeps the dmabuf mutex
operations near each other in the same file. (Christian König)
Add Christian's RB to dma-buf: Rename debugfs symbols
Drop RFC: dma-buf: Remove DMA-BUF statistics
v3: https://lore.kernel.org/all/20250507001036.2278781-1-tjmercier@google.com
v3 -> v4:
Fix selftest BPF program comment style (not kdoc) per Alexei Starovoitov
Fix dma-buf.c kdoc comment style per Alexei Starovoitov
Rename get_first_dmabuf / get_next_dmabuf to dma_buf_iter_begin /
dma_buf_iter_next per Christian König
Add Christian's RB to bpf: Add dmabuf iterator
T.J. Mercier (5):
dma-buf: Rename debugfs symbols
bpf: Add dmabuf iterator
bpf: Add open coded dmabuf iterator
selftests/bpf: Add test for dmabuf_iter
selftests/bpf: Add test for open coded dmabuf_iter
drivers/dma-buf/dma-buf.c | 98 +++++--
include/linux/dma-buf.h | 4 +-
kernel/bpf/Makefile | 3 +
kernel/bpf/dmabuf_iter.c | 149 ++++++++++
kernel/bpf/helpers.c | 5 +
.../testing/selftests/bpf/bpf_experimental.h | 5 +
tools/testing/selftests/bpf/config | 3 +
.../selftests/bpf/prog_tests/dmabuf_iter.c | 258 ++++++++++++++++++
.../testing/selftests/bpf/progs/dmabuf_iter.c | 91 ++++++
9 files changed, 594 insertions(+), 22 deletions(-)
create mode 100644 kernel/bpf/dmabuf_iter.c
create mode 100644 tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c
create mode 100644 tools/testing/selftests/bpf/progs/dmabuf_iter.c
base-commit: 43745d11bfd9683abdf08ad7a5cc403d6a9ffd15
--
2.49.0.1015.ga840276032-goog
From: Chia-Yu Chang <chia-yu.chang(a)nokia-bell-labs.com>
Hello,
Please find DUALPI2 iproute2 patch v8.
v8 (09-May-25)
- Update pkt_sched.h with the one in nex-next
- Fix one typo of comment in pkt_sched.h (ALOK TIWARI <alok.a.tiwari(a)oracle.com>)
- Update manual content in man/man8/tc-dualpi2.8 (ALOK TIWARI <alok.a.tiwari(a)oracle.com>)
- Update tc/q_dualpi2.c to fix missing blank lines and add missing case (ALOK TIWARI <alok.a.tiwari(a)oracle.com>)
v7 (05-May-25)
- Align pkt_sched.h with the v14 version of net-next due to spec modificiaotn in tc.yaml
- Reorganize dualpi2_print_opt() to match the order in tc.yaml
- Remove credit-queue in PRINT_JSON
v6 (26-Apr-25)
- Update JSON file output due to spec modifiocation in tc.yaml of net-next
v5 (25-Mar-25)
- Use matches() to replace current strcmp() (Stephen Hemminger <stephen(a)networkplumber.org>)
- Use general parse_percent() for handling scaled percentage values (Stephen Hemminger <stephen(a)networkplumber.org>)
- Add print function for JSON of dualpi2 stats (Stephen Hemminger <stephen(a)networkplumber.org>)
v4 (16-Mar-25)
- Add min_qlen_step to dualpi2 attribute as the minimum queue length in number of packets in the L-queue to start step amrking.
v3 (21-Feb-25)
- Add memlimit to dualpi2 attribute, and add memory_used, max_memory_used, memory_limit in dualpi2 stats (Dave Taht <dave.taht(a)gmail.com>)
- Update manual to align latest implementation and clarify the queue naming and default unit
- Use common "get_scaled_alpha_beta" and clean print_opt for Dualpi2
v2 (23-Oct-24)
- Rename get_float in dualpi2 to get_float_min_max in utils.c
- Move get_float from iplink_can.c in utils.c (Stephen Hemminger <stephen(a)networkplumber.org>)
- Add print function for JSON of dualpi2 (Stephen Hemminger <stephen(a)networkplumber.org>)
For more details of DualPI2, plesae refer IETF RFC9332
(https://datatracker.ietf.org/doc/html/rfc9332).
Best Regards,
Chia-Yu
Chia-Yu Chang (1):
tc: add dualpi2 scheduler module
bash-completion/tc | 11 +-
include/uapi/linux/pkt_sched.h | 68 +++++
include/utils.h | 2 +
ip/iplink_can.c | 14 -
lib/utils.c | 30 ++
man/man8/tc-dualpi2.8 | 249 +++++++++++++++
tc/Makefile | 1 +
tc/q_dualpi2.c | 534 +++++++++++++++++++++++++++++++++
8 files changed, 894 insertions(+), 15 deletions(-)
create mode 100644 man/man8/tc-dualpi2.8
create mode 100644 tc/q_dualpi2.c
--
2.34.1
There are KUnit tests for the bitfield packing library but these depend
on CONFIG_PACKING which is not enabled by anything in either the
existing KUnit all_tests.config or the base UML config it runs on as
standard. Enable that in all_tests.config to improve coverage.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
tools/testing/kunit/configs/all_tests.config | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config
index cdd9782f9646..b0223b7aebde 100644
--- a/tools/testing/kunit/configs/all_tests.config
+++ b/tools/testing/kunit/configs/all_tests.config
@@ -51,3 +51,5 @@ CONFIG_SOUND=y
CONFIG_SND=y
CONFIG_SND_SOC=y
CONFIG_SND_SOC_TOPOLOGY_BUILD=y
+
+CONFIG_PACKING=y
---
base-commit: 8ffd015db85fea3e15a77027fda6c02ced4d2444
change-id: 20250422-kunit-enable-missing-tests-9189ee930cae
Best regards,
--
Mark Brown <broonie(a)kernel.org>
TRACE_syscall.ptrace.negative_ENOSYS and TRACE_syscall.seccomp.negative_ENOSYS
on arm32 are being reported as failures instead of skipping.
The teardown_trace_fixture function sets the test to KSFT_FAIL in case of a
non 0 return value from the tracer process.
Due to _metadata now being shared between the forked processes the tracer is
returning the KSFT_SKIP value set by the tracee which is non 0.
Remove the setting of the _metadata.exit_code in teardown_trace_fixture.
Fixes: 24cf65a62266 ("selftests/harness: Share _metadata between forked processes")
Signed-off-by: Terry Tritton <terry.tritton(a)linaro.org>
---
tools/testing/selftests/seccomp/seccomp_bpf.c | 6 ------
1 file changed, 6 deletions(-)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index b2f76a52215a..c43a6f8f8cd5 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -1629,14 +1629,8 @@ void teardown_trace_fixture(struct __test_metadata *_metadata,
{
if (tracer) {
int status;
- /*
- * Extract the exit code from the other process and
- * adopt it for ourselves in case its asserts failed.
- */
ASSERT_EQ(0, kill(tracer, SIGUSR1));
ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
- if (WEXITSTATUS(status))
- _metadata->exit_code = KSFT_FAIL;
}
}
--
2.39.5