- Linux-kselftest-mirror - lists.linaro.org

[PATCH net-next v2 6/6] selftests: net: extend SCM_PIDFD test to cover stale pidfds

by Alexander Mikhalitsyn

Extend SCM_PIDFD test scenarios to also cover dead task's pidfd retrieval and reading its exit info. Cc: linux-kselftest(a)vger.kernel.org Cc: linux-kernel(a)vger.kernel.org Cc: netdev(a)vger.kernel.org Cc: Shuah Khan <shuah(a)kernel.org> Cc: "David S. Miller" <davem(a)davemloft.net> Cc: Eric Dumazet <edumazet(a)google.com> Cc: Jakub Kicinski <kuba(a)kernel.org> Cc: Paolo Abeni <pabeni(a)redhat.com> Cc: Simon Horman <horms(a)kernel.org> Cc: Christian Brauner <brauner(a)kernel.org> Cc: Kuniyuki Iwashima <kuniyu(a)google.com> Cc: Lennart Poettering <mzxreary(a)0pointer.de> Cc: Luca Boccassi <bluca(a)debian.org> Cc: David Rheinsberg <david(a)readahead.eu> Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn(a)canonical.com> Reviewed-by: Christian Brauner <brauner(a)kernel.org> --- .../testing/selftests/net/af_unix/scm_pidfd.c | 217 ++++++++++++++---- 1 file changed, 173 insertions(+), 44 deletions(-) diff --git a/tools/testing/selftests/net/af_unix/scm_pidfd.c b/tools/testing/selftests/net/af_unix/scm_pidfd.c index 7e534594167e..37e034874034 100644 --- a/tools/testing/selftests/net/af_unix/scm_pidfd.c +++ b/tools/testing/selftests/net/af_unix/scm_pidfd.c @@ -15,6 +15,7 @@ #include <sys/types.h> #include <sys/wait.h> +#include "../../pidfd/pidfd.h" #include "../../kselftest_harness.h" #define clean_errno() (errno == 0 ? 
"None" : strerror(errno)) @@ -26,6 +27,8 @@ #define SCM_PIDFD 0x04 #endif +#define CHILD_EXIT_CODE_OK 123 + static void child_die() { exit(1); @@ -126,16 +129,65 @@ static pid_t get_pid_from_fdinfo_file(int pidfd, const char *key, size_t keylen) return result; } +struct cmsg_data { + struct ucred *ucred; + int *pidfd; +}; + +static int parse_cmsg(struct msghdr *msg, struct cmsg_data *res) +{ + struct cmsghdr *cmsg; + int data = 0; + + if (msg->msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + log_err("recvmsg: truncated"); + return 1; + } + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_PIDFD) { + if (cmsg->cmsg_len < sizeof(*res->pidfd)) { + log_err("CMSG parse: SCM_PIDFD wrong len"); + return 1; + } + + res->pidfd = (void *)CMSG_DATA(cmsg); + } + + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS) { + if (cmsg->cmsg_len < sizeof(*res->ucred)) { + log_err("CMSG parse: SCM_CREDENTIALS wrong len"); + return 1; + } + + res->ucred = (void *)CMSG_DATA(cmsg); + } + } + + if (!res->pidfd) { + log_err("CMSG parse: SCM_PIDFD not found"); + return 1; + } + + if (!res->ucred) { + log_err("CMSG parse: SCM_CREDENTIALS not found"); + return 1; + } + + return 0; +} + static int cmsg_check(int fd) { struct msghdr msg = { 0 }; - struct cmsghdr *cmsg; + struct cmsg_data res; struct iovec iov; - struct ucred *ucred = NULL; int data = 0; char control[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))] = { 0 }; - int *pidfd = NULL; pid_t parent_pid; int err; @@ -158,53 +210,99 @@ static int cmsg_check(int fd) return 1; } - for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; - cmsg = CMSG_NXTHDR(&msg, cmsg)) { - if (cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_PIDFD) { - if (cmsg->cmsg_len < sizeof(*pidfd)) { - log_err("CMSG parse: SCM_PIDFD wrong len"); - return 1; - } + /* send(pfd, "x", sizeof(char), 0) */ + if (data != 'x') { + log_err("recvmsg: 
data corruption"); + return 1; + } - pidfd = (void *)CMSG_DATA(cmsg); - } + if (parse_cmsg(&msg, &res)) { + log_err("CMSG parse: parse_cmsg() failed"); + return 1; + } - if (cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_CREDENTIALS) { - if (cmsg->cmsg_len < sizeof(*ucred)) { - log_err("CMSG parse: SCM_CREDENTIALS wrong len"); - return 1; - } + /* pidfd from SCM_PIDFD should point to the parent process PID */ + parent_pid = + get_pid_from_fdinfo_file(*res.pidfd, "Pid:", sizeof("Pid:") - 1); + if (parent_pid != getppid()) { + log_err("wrong SCM_PIDFD %d != %d", parent_pid, getppid()); + close(*res.pidfd); + return 1; + } - ucred = (void *)CMSG_DATA(cmsg); - } + close(*res.pidfd); + return 0; +} + +static int cmsg_check_dead(int fd, int expected_pid) +{ + int err; + struct msghdr msg = { 0 }; + struct cmsg_data res; + struct iovec iov; + int data = 0; + char control[CMSG_SPACE(sizeof(struct ucred)) + + CMSG_SPACE(sizeof(int))] = { 0 }; + pid_t client_pid; + struct pidfd_info info = { + .mask = PIDFD_INFO_EXIT, + }; + + iov.iov_base = &data; + iov.iov_len = sizeof(data); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + err = recvmsg(fd, &msg, 0); + if (err < 0) { + log_err("recvmsg"); + return 1; } - /* send(pfd, "x", sizeof(char), 0) */ - if (data != 'x') { + if (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + log_err("recvmsg: truncated"); + return 1; + } + + /* send(cfd, "y", sizeof(char), 0) */ + if (data != 'y') { log_err("recvmsg: data corruption"); return 1; } - if (!pidfd) { - log_err("CMSG parse: SCM_PIDFD not found"); + if (parse_cmsg(&msg, &res)) { + log_err("CMSG parse: parse_cmsg() failed"); return 1; } - if (!ucred) { - log_err("CMSG parse: SCM_CREDENTIALS not found"); + /* + * pidfd from SCM_PIDFD should point to the client_pid. + * Let's read exit information and check if it's what + * we expect to see. 
+ */ + if (ioctl(*res.pidfd, PIDFD_GET_INFO, &info)) { + log_err("%s: ioctl(PIDFD_GET_INFO) failed", __func__); + close(*res.pidfd); return 1; } - /* pidfd from SCM_PIDFD should point to the parent process PID */ - parent_pid = - get_pid_from_fdinfo_file(*pidfd, "Pid:", sizeof("Pid:") - 1); - if (parent_pid != getppid()) { - log_err("wrong SCM_PIDFD %d != %d", parent_pid, getppid()); + if (!(info.mask & PIDFD_INFO_EXIT)) { + log_err("%s: No exit information from ioctl(PIDFD_GET_INFO)", __func__); + close(*res.pidfd); return 1; } + err = WIFEXITED(info.exit_code) ? WEXITSTATUS(info.exit_code) : 1; + if (err != CHILD_EXIT_CODE_OK) { + log_err("%s: wrong exit_code %d != %d", __func__, err, CHILD_EXIT_CODE_OK); + close(*res.pidfd); + return 1; + } + + close(*res.pidfd); return 0; } @@ -291,6 +389,24 @@ static void fill_sockaddr(struct sock_addr *addr, bool abstract) memcpy(sun_path_buf, addr->sock_name, strlen(addr->sock_name)); } +static int sk_enable_cred_pass(int sk) +{ + int on = 0; + + on = 1; + if (setsockopt(sk, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on))) { + log_err("Failed to set SO_PASSCRED"); + return 1; + } + + if (setsockopt(sk, SOL_SOCKET, SO_PASSPIDFD, &on, sizeof(on))) { + log_err("Failed to set SO_PASSPIDFD"); + return 1; + } + + return 0; +} + static void client(FIXTURE_DATA(scm_pidfd) *self, const FIXTURE_VARIANT(scm_pidfd) *variant) { @@ -299,7 +415,6 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, struct ucred peer_cred; int peer_pidfd; pid_t peer_pid; - int on = 0; cfd = socket(AF_UNIX, variant->type, 0); if (cfd < 0) { @@ -322,14 +437,8 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, child_die(); } - on = 1; - if (setsockopt(cfd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on))) { - log_err("Failed to set SO_PASSCRED"); - child_die(); - } - - if (setsockopt(cfd, SOL_SOCKET, SO_PASSPIDFD, &on, sizeof(on))) { - log_err("Failed to set SO_PASSPIDFD"); + if (sk_enable_cred_pass(cfd)) { + log_err("sk_enable_cred_pass() failed"); child_die(); } @@ 
-340,6 +449,12 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, child_die(); } + /* send something to the parent so it can receive SCM_PIDFD too and validate it */ + if (send(cfd, "y", sizeof(char), 0) == -1) { + log_err("Failed to send(cfd, \"y\", sizeof(char), 0)"); + child_die(); + } + /* skip further for SOCK_DGRAM as it's not applicable */ if (variant->type == SOCK_DGRAM) return; @@ -398,7 +513,13 @@ TEST_F(scm_pidfd, test) close(self->server); close(self->startup_pipe[0]); client(self, variant); - exit(0); + + /* + * It's a bit unusual, but in case of success we return non-zero + * exit code (CHILD_EXIT_CODE_OK) and then we expect to read it + * from ioctl(PIDFD_GET_INFO) in cmsg_check_dead(). + */ + exit(CHILD_EXIT_CODE_OK); } close(self->startup_pipe[1]); @@ -421,9 +542,17 @@ TEST_F(scm_pidfd, test) ASSERT_NE(-1, err); } - close(pfd); waitpid(self->client_pid, &child_status, 0); - ASSERT_EQ(0, WIFEXITED(child_status) ? WEXITSTATUS(child_status) : 1); + /* see comment before exit(CHILD_EXIT_CODE_OK) */ + ASSERT_EQ(CHILD_EXIT_CODE_OK, WIFEXITED(child_status) ? WEXITSTATUS(child_status) : 1); + + err = sk_enable_cred_pass(pfd); + ASSERT_EQ(0, err); + + err = cmsg_check_dead(pfd, self->client_pid); + ASSERT_EQ(0, err); + + close(pfd); } TEST_HARNESS_MAIN -- 2.43.0

3 months, 2 weeks

1
0
0 0

[PATCH resend] rtc: Rename lib_test to rtc_lib_test

by Geert Uytterhoeven

When compiling the RTC library functions test as a module, the module has the non-descriptive name "lib_test.ko". Fix this by adding the subsystem's name as a prefix. Signed-off-by: Geert Uytterhoeven <geert(a)linux-m68k.org> --- drivers/rtc/Makefile | 2 +- drivers/rtc/{lib_test.c => rtc_lib_test.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename drivers/rtc/{lib_test.c => rtc_lib_test.c} (100%) diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 4619aa2ac4697591..aa08d1c8a32ec23d 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -15,7 +15,7 @@ rtc-core-$(CONFIG_RTC_INTF_DEV) += dev.o rtc-core-$(CONFIG_RTC_INTF_PROC) += proc.o rtc-core-$(CONFIG_RTC_INTF_SYSFS) += sysfs.o -obj-$(CONFIG_RTC_LIB_KUNIT_TEST) += lib_test.o +obj-$(CONFIG_RTC_LIB_KUNIT_TEST) += rtc_lib_test.o # Keep the list ordered. diff --git a/drivers/rtc/lib_test.c b/drivers/rtc/rtc_lib_test.c similarity index 100% rename from drivers/rtc/lib_test.c rename to drivers/rtc/rtc_lib_test.c -- 2.43.0

3 months, 2 weeks

2
2
0 0

[PATCH net-next] selftests: devmem: configure HDS threshold

by Taehee Yoo

The devmem TCP requires the hds-thresh value to be 0, but it doesn't change it automatically. Therefore, configure_hds_thresh() is added to handle this. The run_devmem_tests() now tests hds_thresh, but it skips test if the hds_thresh_max value is 0. Signed-off-by: Taehee Yoo <ap420073(a)gmail.com> --- .../selftests/drivers/net/hw/ncdevmem.c | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index cc9b40d9c5d5..d78b5e5697d7 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -349,6 +349,72 @@ static int configure_headersplit(bool on) return ret; } +static int configure_hds_thresh(int len) +{ + struct ethtool_rings_get_req *get_req; + struct ethtool_rings_get_rsp *get_rsp; + struct ethtool_rings_set_req *req; + struct ynl_error yerr; + struct ynl_sock *ys; + int ret; + + ys = ynl_sock_create(&ynl_ethtool_family, &yerr); + if (!ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + req = ethtool_rings_set_req_alloc(); + ethtool_rings_set_req_set_header_dev_index(req, ifindex); + ethtool_rings_set_req_set_hds_thresh(req, len); + ret = ethtool_rings_set(ys, req); + if (ret < 0) + fprintf(stderr, "YNL failed: %s\n", ys->err.msg); + ethtool_rings_set_req_free(req); + + if (ret == 0) { + get_req = ethtool_rings_get_req_alloc(); + ethtool_rings_get_req_set_header_dev_index(get_req, ifindex); + get_rsp = ethtool_rings_get(ys, get_req); + ethtool_rings_get_req_free(get_req); + if (get_rsp) + fprintf(stderr, "HDS threshold: %d\n", + get_rsp->hds_thresh); + ethtool_rings_get_rsp_free(get_rsp); + } + + ynl_sock_destroy(ys); + + return ret; +} + +static int get_hds_thresh_max(void) +{ + struct ethtool_rings_get_req *get_req; + struct ethtool_rings_get_rsp *get_rsp; + struct ynl_error yerr; + unsigned int ret = 0; + struct ynl_sock *ys; + + ys = 
ynl_sock_create(&ynl_ethtool_family, &yerr); + if (!ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + get_req = ethtool_rings_get_req_alloc(); + ethtool_rings_get_req_set_header_dev_index(get_req, ifindex); + get_rsp = ethtool_rings_get(ys, get_req); + ethtool_rings_get_req_free(get_req); + if (get_rsp) + ret = get_rsp->hds_thresh_max; + ethtool_rings_get_rsp_free(get_rsp); + + ynl_sock_destroy(ys); + + return ret; +} + static int configure_rss(void) { return run_command("sudo ethtool -X %s equal %d >&2", ifname, start_queue); @@ -565,6 +631,9 @@ static int do_server(struct memory_buffer *mem) if (configure_headersplit(1)) error(1, 0, "Failed to enable TCP header split\n"); + if (configure_hds_thresh(0)) + error(1, 0, "Failed to set HDS threshold\n"); + /* Configure RSS to divert all traffic from our devmem queues */ if (configure_rss()) error(1, 0, "Failed to configure rss\n"); @@ -725,10 +794,14 @@ static int do_server(struct memory_buffer *mem) void run_devmem_tests(void) { struct memory_buffer *mem; + int hds_thresh_max = 0; struct ynl_sock *ys; mem = provider->alloc(getpagesize() * NUM_PAGES); + hds_thresh_max = get_hds_thresh_max(); + if (hds_thresh_max < 0) + error(1, 0, "Failed to get hds_thresh_max"); /* Configure RSS to divert all traffic from our devmem queues */ if (configure_rss()) error(1, 0, "rss error\n"); @@ -750,6 +823,19 @@ void run_devmem_tests(void) if (configure_headersplit(1)) error(1, 0, "Failed to configure header split\n"); + /* Do not test hds_thresh if hds_thresh_max value is 0 */ + if (hds_thresh_max) { + if (configure_hds_thresh(1)) + error(1, 0, "Failed to configure HDS threshold\n"); + + if (!bind_rx_queue(ifindex, mem->fd, create_queues(), + num_queues, &ys)) + error(1, 0, "Configure dmabuf with non-zero HDS threshold should have failed\n"); + + if (configure_hds_thresh(0)) + error(1, 0, "Failed to configure HDS threshold\n"); + } + if (bind_rx_queue(ifindex, mem->fd, create_queues(), num_queues, &ys)) error(1, 0, 
"Failed to bind\n"); -- 2.34.1

3 months, 2 weeks

3
4
0 0

[PATCH] selftests: cachestat: add tests for mmap

by Suresh K C

From: Suresh K C <suresh.k.chandrappa(a)gmail.com> Add a test case to verify cachestat behavior with memory-mapped files using mmap(). This ensures that pages accessed via mmap are correctly accounted for in the page cache. Tested on x86_64 with default kernel config Signed-off-by: Suresh K C <suresh.k.chandrappa(a)gmail.com> --- .../selftests/cachestat/test_cachestat.c | 39 ++++++++++++++++--- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c index 632ab44737ec..b6452978dae0 100644 --- a/tools/testing/selftests/cachestat/test_cachestat.c +++ b/tools/testing/selftests/cachestat/test_cachestat.c @@ -33,6 +33,11 @@ void print_cachestat(struct cachestat *cs) cs->nr_evicted, cs->nr_recently_evicted); } +enum file_type { + FILE_MMAP, + FILE_SHMEM +}; + bool write_exactly(int fd, size_t filesize) { int random_fd = open("/dev/urandom", O_RDONLY); @@ -202,7 +207,7 @@ static int test_cachestat(const char *filename, bool write_random, bool create, return ret; } -bool test_cachestat_shmem(void) +bool run_cachestat_test(enum file_type type) { size_t PS = sysconf(_SC_PAGESIZE); size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */ @@ -212,27 +217,43 @@ bool test_cachestat_shmem(void) char *filename = "tmpshmcstat"; struct cachestat cs; bool ret = true; + int fd; unsigned long num_pages = compute_len / PS; - int fd = shm_open(filename, O_CREAT | O_RDWR, 0600); + if (type == FILE_SHMEM) + fd = shm_open(filename, O_CREAT | O_RDWR, 0600); + else + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd < 0) { - ksft_print_msg("Unable to create shmem file.\n"); + ksft_print_msg("Unable to create file.\n"); ret = false; goto out; } if (ftruncate(fd, filesize)) { - ksft_print_msg("Unable to truncate shmem file.\n"); + ksft_print_msg("Unable to truncate file.\n"); ret = false; goto close_fd; } if (!write_exactly(fd, filesize)) { - ksft_print_msg("Unable to write to 
shmem file.\n"); + ksft_print_msg("Unable to write to file.\n"); ret = false; goto close_fd; } + if (type == FILE_MMAP){ + char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (map == MAP_FAILED) { + ksft_print_msg("mmap failed.\n"); + ret = false; + goto close_fd; + } + for (int i = 0; i < filesize; i++) { + map[i] = 'A'; + } + map[filesize - 1] = 'X'; + } syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0); if (syscall_ret) { @@ -308,12 +329,18 @@ int main(void) break; } - if (test_cachestat_shmem()) + if (run_cachestat_test(FILE_SHMEM)) ksft_test_result_pass("cachestat works with a shmem file\n"); else { ksft_test_result_fail("cachestat fails with a shmem file\n"); ret = 1; } + if (run_cachestat_test(FILE_MMAP)) + ksft_test_result_pass("cachestat works with a mmap file\n"); + else { + ksft_test_result_fail("cachestat fails with a mmap file\n"); + ret = 1; + } return ret; } -- 2.43.0

3 months, 2 weeks

3
2
0 0

[PATCH v9 net-next 00/15] AccECN protocol patch series

by chia-yu.chang＠nokia-bell-labs.com

From: Chia-Yu Chang <chia-yu.chang(a)nokia-bell-labs.com> Hello, Please find the v9 AccECN protocol patch series, which covers the core functionality of Accurate ECN, AccECN negotiation, AccECN TCP options, and AccECN failure handling. The Accurate ECN draft can be found in https://datatracker.ietf.org/doc/html/draft-ietf-tcpm-accurate-ecn-28 This patch series is part of the full AccECN patch series, which is available at https://github.com/L4STeam/linux-net-next/commits/upstream_l4steam/ Best Regards, Chia-Yu --- v9 (21-Jun-2025) - Use tcp_data_ecn_check() to set TCP_ECN_SEEN flag only for RFC3168 ECN (Paolo Abeni <pabeni(a)redhat.com>) - Add comments about setting TCP_ECN_SEEN flag for RFC3168 and Accurate ECN (Paolo Abeni <pabeni(a)redhat.com>) - Restructure the code in the for loop of tcp_accecn_process_option() (Paolo Abeni <pabeni(a)redhat.com>) - Remove ecn_bytes and add use_synack_ecn_bytes flag to identify whether syn_ack_bytes or received_ecn_bytes is used (Paolo Abeni <pabeni(a)redhat.com>) - Replace leftover_bytes and leftover_size with leftover_highbyte and leftover_lowbyte and add comments in tcp_options_write() (Paolo Abeni <pabeni(a)redhat.com>) - Add comments and commit message about the 1st retx SYN still attempting AccECN negotiation (Paolo Abeni <pabeni(a)redhat.com>) v8 (10-Jun-2025) - Add new helper function tcp_ecn_received_counters_payload() in #6 (Paolo Abeni <pabeni(a)redhat.com>) - Set opts->num_sack_blocks=0 to avoid potential undefined value in #8 (Paolo Abeni <pabeni(a)redhat.com>) - Reset leftover_size to 2 once leftover_bytes is used in #9 (Paolo Abeni <pabeni(a)redhat.com>) - Add new helper function tcp_accecn_opt_demand_min() in #10 (Paolo Abeni <pabeni(a)redhat.com>) - Add new helper function tcp_accecn_saw_opt_fail_recv() in #11 (Paolo Abeni <pabeni(a)redhat.com>) - Update tcp_options_fit_accecn() to avoid using recursion in #14 (Paolo Abeni <pabeni(a)redhat.com>) v7 (14-May-2025) - Modify group sizes of tcp_sock_write_txrx and 
tcp_sock_write_rx in #3 based on pahole results (Paolo Abeni <pabeni(a)redhat.com>) - Fix the issue in #4 and #5 where the RFC3168 ECN behavior in tcp_ecn_send() is changed (Paolo Abeni <pabeni(a)redhat.com>) - Modify group size of tcp_sock_write_txrx in #4 and #6 based on pahole results (Paolo Abeni <pabeni(a)redhat.com>) - Update commit message for #9 to explain the increase in tcp_sock_write_rx group size - Modify group size of tcp_sock_write_tx in #10 based on pahole results v6 (09-May-2025) - Add #3 to utilize existing holes of tcp_sock_write_txrx group for later patches (#4, #9, #10) with new u8 members (Paolo Abeni <pabeni(a)redhat.com>) - Add pahole outcomes before and after commit in #4, #5, #6, #9, #10, #15 (Paolo Abeni <pabeni(a)redhat.com>) - Define new helper function tcp_send_ack_reflect_ect() for sending ACK with reflected ECT in #5 (Paolo Abeni <pabeni(a)redhat.com>) - Add comments for function tcp_ecn_rcv_synack() in #5 (Paolo Abeni <pabeni(a)redhat.com>) - Add enum/define to be used by sysctl_tcp_ecn in #5, sysctl_tcp_ecn_option in #9, and sysctl_tcp_ecn_option_beacon in #10 (Paolo Abeni <pabeni(a)redhat.com>) - Move accecn_fail_mode and saw_accecn_opt in #5 and #11 to use existing holes of tcp_sock (Paolo Abeni <pabeni(a)redhat.com>) - Change data type of new members of tcp_request_sock and move them to the end of struct in #5 and #11 (Paolo Abeni <pabeni(a)redhat.com>) - Move new members of tcp_info to the end of struct in #6 (Paolo Abeni <pabeni(a)redhat.com>) - Merge previous #7 into #9 (Paolo Abeni <pabeni(a)redhat.com>) - Mask ecnfield with INET_ECN_MASK to remove WARN_ONCE in #9 (Paolo Abeni <pabeni(a)redhat.com>) - Reduce the indentation levels for readability in #9 and #10 (Paolo Abeni <pabeni(a)redhat.com>) - Move delivered_ecn_bytes to the RX group in #9, accecn_opt_tstamp to the TX group in #10, pkts_acked_ewma to the RX group in #15 (Paolo Abeni <pabeni(a)redhat.com>) - Add changes in 
Documentation/networking/net_cachelines/tcp_sock.rst for new tcp_sock members in #3, #5, #6, #9, #10, #15 v5 (22-Apr-2025) - Further fix for 32-bit ARM alignment in tcp.c (Simon Horman <horms(a)kernel.org>) v4 (18-Apr-2025) - Fix 32-bit ARM assertion for alignment requirement (Simon Horman <horms(a)kernel.org>) v3 (14-Apr-2025) - Fix patch apply issue in v2 (Jakub Kicinski <kuba(a)kernel.org>) v2 (18-Mar-2025) - Add one missing patch from the previous AccECN protocol preparation patch series to this patch series. --- Chia-Yu Chang (3): tcp: reorganize tcp_sock_write_txrx group for variables later tcp: accecn: AccECN option failure handling tcp: accecn: try to fit AccECN option with SACK Ilpo Järvinen (12): tcp: reorganize SYN ECN code tcp: fast path functions later tcp: AccECN core tcp: accecn: AccECN negotiation tcp: accecn: add AccECN rx byte counters tcp: accecn: AccECN needs to know delivered bytes tcp: sack option handling improvements tcp: accecn: AccECN option tcp: accecn: AccECN option send control tcp: accecn: AccECN option ceb/cep heuristic tcp: accecn: AccECN ACE field multi-wrap heuristic tcp: try to avoid safer when ACKs are thinned .../networking/net_cachelines/tcp_sock.rst | 14 + include/linux/tcp.h | 34 +- include/net/netns/ipv4.h | 2 + include/net/tcp.h | 225 ++++++- include/uapi/linux/tcp.h | 7 + net/ipv4/syncookies.c | 3 + net/ipv4/sysctl_net_ipv4.c | 19 + net/ipv4/tcp.c | 30 +- net/ipv4/tcp_input.c | 622 +++++++++++++++++- net/ipv4/tcp_ipv4.c | 7 +- net/ipv4/tcp_minisocks.c | 91 ++- net/ipv4/tcp_output.c | 325 ++++++++- net/ipv6/syncookies.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 14 files changed, 1285 insertions(+), 96 deletions(-) -- 2.34.1

3 months, 3 weeks

3
28
0 0

[PATCH net-next v3 0/3] selftest: net: Add selftest for netpoll

by Breno Leitao

I am submitting a new selftest for the netpoll subsystem specifically targeting the case where the RX is polling in the TX path, which is a case for which we don't have any test in the tree today. This is done when netpoll_poll_dev() is called, and this test creates a scenario where that is probable. The test does the following: 1) Configuring a single RX/TX queue to increase contention on the interface. 2) Generating background traffic to saturate the network, mimicking real-world congestion. 3) Sending netconsole messages to trigger netpoll polling and monitor its behavior. 4) Using dynamic netconsole targets via configfs, with the ability to delete and recreate targets during the test. 5) Running bpftrace in parallel to verify that netpoll_poll_dev() is called when expected. If it is called, then the test passes, otherwise the test is marked as skipped. In order to achieve it, I stole Jakub's bpftrace helper from [1], and did some small changes that I found useful to use the helper. So, this patchset basically contains: 1) The code stolen from Jakub 2) Improvements on bpftrace() helper 3) The selftest itself Link: https://lore.kernel.org/all/20250421222827.283737-22-kuba@kernel.org/ [1] --- Changes in v3: - Make pylint happy (Simon) - Remove the unnecessary patch in bpftrace to raise an exception when it fails. (Jakub) - Improved the bpftrace code (Willem) - Stop sending messages if bpftrace is not alive anymore. - Link to v2: https://lore.kernel.org/r/20250625-netpoll_test-v2-0-47d27775222c@debian.org Changes in v2: - Stole Jakub's helper to run bpftrace - Removed the DEBUG option and moved logs to logging - Change the code to have a higher chance of calling netpoll_poll_dev(). In my current configuration, it is hitting multiple times during the test. 
- Save and restore TX/RX queue size (Jakub) - Link to v1: https://lore.kernel.org/r/20250620-netpoll_test-v1-1-5068832f72fc@debian.org --- Breno Leitao (2): selftests: drv-net: Strip '@' prefix from bpftrace map keys selftests: net: add netpoll basic functionality test Jakub Kicinski (1): selftests: drv-net: add helper/wrapper for bpftrace tools/testing/selftests/drivers/net/Makefile | 1 + .../selftests/drivers/net/lib/py/__init__.py | 3 +- .../testing/selftests/drivers/net/netpoll_basic.py | 345 +++++++++++++++++++++ tools/testing/selftests/net/lib/py/utils.py | 35 +++ 4 files changed, 383 insertions(+), 1 deletion(-) --- base-commit: 8efa26fcbf8a7f783fd1ce7dd2a409e9b7758df0 change-id: 20250612-netpoll_test-a1324d2057c8 Best regards, -- Breno Leitao <leitao(a)debian.org>

3 months, 3 weeks

3
8
0 0

[PATCH net-next 2/2] selftests: seg6: fix instaces typo in comments

by Andrea Mayer

Fix a typo: instaces -> instances The typo has been identified using codespell, and the tool does not report any additional issues in the selftests considered. Signed-off-by: Andrea Mayer <andrea.mayer(a)uniroma2.it> --- tools/testing/selftests/net/srv6_end_next_csid_l3vpn_test.sh | 2 +- tools/testing/selftests/net/srv6_end_x_next_csid_l3vpn_test.sh | 2 +- tools/testing/selftests/net/srv6_hencap_red_l3vpn_test.sh | 2 +- tools/testing/selftests/net/srv6_hl2encap_red_l2vpn_test.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/srv6_end_next_csid_l3vpn_test.sh b/tools/testing/selftests/net/srv6_end_next_csid_l3vpn_test.sh index ba730655a7bf..4bc135e5c22c 100755 --- a/tools/testing/selftests/net/srv6_end_next_csid_l3vpn_test.sh +++ b/tools/testing/selftests/net/srv6_end_next_csid_l3vpn_test.sh @@ -594,7 +594,7 @@ setup_rt_local_sids() dev "${DUMMY_DEVNAME}" # all SIDs for VPNs start with a common locator. Routes and SRv6 - # Endpoint behavior instaces are grouped together in the 'localsid' + # Endpoint behavior instances are grouped together in the 'localsid' # table. ip -netns "${nsname}" -6 rule \ add to "${VPN_LOCATOR_SERVICE}::/16" \ diff --git a/tools/testing/selftests/net/srv6_end_x_next_csid_l3vpn_test.sh b/tools/testing/selftests/net/srv6_end_x_next_csid_l3vpn_test.sh index bedf0ce885c2..34b781a2ae74 100755 --- a/tools/testing/selftests/net/srv6_end_x_next_csid_l3vpn_test.sh +++ b/tools/testing/selftests/net/srv6_end_x_next_csid_l3vpn_test.sh @@ -681,7 +681,7 @@ setup_rt_local_sids() set_underlay_sids_reachability "${rt}" "${rt_neighs}" # all SIDs for VPNs start with a common locator. Routes and SRv6 - # Endpoint behavior instaces are grouped together in the 'localsid' + # Endpoint behavior instances are grouped together in the 'localsid' # table. 
ip -netns "${nsname}" -6 rule \ add to "${VPN_LOCATOR_SERVICE}::/16" \ diff --git a/tools/testing/selftests/net/srv6_hencap_red_l3vpn_test.sh b/tools/testing/selftests/net/srv6_hencap_red_l3vpn_test.sh index 3efce1718c5f..6a68c7eff1dc 100755 --- a/tools/testing/selftests/net/srv6_hencap_red_l3vpn_test.sh +++ b/tools/testing/selftests/net/srv6_hencap_red_l3vpn_test.sh @@ -395,7 +395,7 @@ setup_rt_local_sids() dev "${VRF_DEVNAME}" # all SIDs for VPNs start with a common locator. Routes and SRv6 - # Endpoint behavior instaces are grouped together in the 'localsid' + # Endpoint behavior instances are grouped together in the 'localsid' # table. ip -netns "${nsname}" -6 rule \ add to "${VPN_LOCATOR_SERVICE}::/16" \ diff --git a/tools/testing/selftests/net/srv6_hl2encap_red_l2vpn_test.sh b/tools/testing/selftests/net/srv6_hl2encap_red_l2vpn_test.sh index cabc70538ffe..0979b5316fdf 100755 --- a/tools/testing/selftests/net/srv6_hl2encap_red_l2vpn_test.sh +++ b/tools/testing/selftests/net/srv6_hl2encap_red_l2vpn_test.sh @@ -343,7 +343,7 @@ setup_rt_local_sids() encap seg6local action End dev "${DUMMY_DEVNAME}" # all SIDs for VPNs start with a common locator. Routes and SRv6 - # Endpoint behaviors instaces are grouped together in the 'localsid' + # Endpoint behaviors instances are grouped together in the 'localsid' # table. ip -netns "${nsname}" -6 rule add \ to "${VPN_LOCATOR_SERVICE}::/16" \ -- 2.20.1

3 months, 3 weeks

2
1
0 0

Re: [PATCH] selftests/mm: add test for (BATCH_PROCESS)MADV_DONTNEED

by wang lian

I deeply appreciate your criticism. This is my first patch and I will improve it based on what we have discussed so far.

3 months, 3 weeks

3
5
0 0

[PATCH 0/3] arm64: Support FEAT_LSFE (Large System Float Extension)

by Mark Brown

FEAT_LSFE is optional from v9.5, it adds new instructions for atomic memory operations with floating point values. We have no immediate use for it in kernel, provide a hwcap so userspace can discover it and allow the ID register field to be exposed to KVM guests. Signed-off-by: Mark Brown <broonie(a)kernel.org> --- Mark Brown (3): arm64/hwcap: Add hwcap for FEAT_LSFE KVM: arm64: Expose FEAT_LSFE to guests kselftest/arm64: Add lsfe to the hwcaps test Documentation/arch/arm64/elf_hwcaps.rst | 4 ++++ arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/kernel/cpufeature.c | 2 ++ arch/arm64/kernel/cpuinfo.c | 1 + arch/arm64/kvm/sys_regs.c | 4 +++- tools/testing/selftests/arm64/abi/hwcap.c | 21 +++++++++++++++++++++ 7 files changed, 33 insertions(+), 1 deletion(-) --- base-commit: 86731a2a651e58953fc949573895f2fa6d456841 change-id: 20250625-arm64-lsfe-0810cf98adc2 Best regards, -- Mark Brown <broonie(a)kernel.org>

3 months, 3 weeks

2
6
0 0

[PATCH net-next 6/6] selftests: net: extend SCM_PIDFD test to cover stale pidfds

by Alexander Mikhalitsyn

Extend SCM_PIDFD test scenarios to also cover dead task's pidfd retrieval and reading its exit info. Cc: linux-kselftest(a)vger.kernel.org Cc: linux-kernel(a)vger.kernel.org Cc: netdev(a)vger.kernel.org Cc: Shuah Khan <shuah(a)kernel.org> Cc: "David S. Miller" <davem(a)davemloft.net> Cc: Eric Dumazet <edumazet(a)google.com> Cc: Jakub Kicinski <kuba(a)kernel.org> Cc: Paolo Abeni <pabeni(a)redhat.com> Cc: Simon Horman <horms(a)kernel.org> Cc: Christian Brauner <brauner(a)kernel.org> Cc: Kuniyuki Iwashima <kuniyu(a)amazon.com> Cc: Lennart Poettering <mzxreary(a)0pointer.de> Cc: Luca Boccassi <bluca(a)debian.org> Cc: David Rheinsberg <david(a)readahead.eu> Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn(a)canonical.com> --- .../testing/selftests/net/af_unix/scm_pidfd.c | 217 ++++++++++++++---- 1 file changed, 173 insertions(+), 44 deletions(-) diff --git a/tools/testing/selftests/net/af_unix/scm_pidfd.c b/tools/testing/selftests/net/af_unix/scm_pidfd.c index 7e534594167e..37e034874034 100644 --- a/tools/testing/selftests/net/af_unix/scm_pidfd.c +++ b/tools/testing/selftests/net/af_unix/scm_pidfd.c @@ -15,6 +15,7 @@ #include <sys/types.h> #include <sys/wait.h> +#include "../../pidfd/pidfd.h" #include "../../kselftest_harness.h" #define clean_errno() (errno == 0 ? 
"None" : strerror(errno)) @@ -26,6 +27,8 @@ #define SCM_PIDFD 0x04 #endif +#define CHILD_EXIT_CODE_OK 123 + static void child_die() { exit(1); @@ -126,16 +129,65 @@ static pid_t get_pid_from_fdinfo_file(int pidfd, const char *key, size_t keylen) return result; } +struct cmsg_data { + struct ucred *ucred; + int *pidfd; +}; + +static int parse_cmsg(struct msghdr *msg, struct cmsg_data *res) +{ + struct cmsghdr *cmsg; + int data = 0; + + if (msg->msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + log_err("recvmsg: truncated"); + return 1; + } + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_PIDFD) { + if (cmsg->cmsg_len < sizeof(*res->pidfd)) { + log_err("CMSG parse: SCM_PIDFD wrong len"); + return 1; + } + + res->pidfd = (void *)CMSG_DATA(cmsg); + } + + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS) { + if (cmsg->cmsg_len < sizeof(*res->ucred)) { + log_err("CMSG parse: SCM_CREDENTIALS wrong len"); + return 1; + } + + res->ucred = (void *)CMSG_DATA(cmsg); + } + } + + if (!res->pidfd) { + log_err("CMSG parse: SCM_PIDFD not found"); + return 1; + } + + if (!res->ucred) { + log_err("CMSG parse: SCM_CREDENTIALS not found"); + return 1; + } + + return 0; +} + static int cmsg_check(int fd) { struct msghdr msg = { 0 }; - struct cmsghdr *cmsg; + struct cmsg_data res; struct iovec iov; - struct ucred *ucred = NULL; int data = 0; char control[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))] = { 0 }; - int *pidfd = NULL; pid_t parent_pid; int err; @@ -158,53 +210,99 @@ static int cmsg_check(int fd) return 1; } - for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; - cmsg = CMSG_NXTHDR(&msg, cmsg)) { - if (cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_PIDFD) { - if (cmsg->cmsg_len < sizeof(*pidfd)) { - log_err("CMSG parse: SCM_PIDFD wrong len"); - return 1; - } + /* send(pfd, "x", sizeof(char), 0) */ + if (data != 'x') { + log_err("recvmsg: 
data corruption"); + return 1; + } - pidfd = (void *)CMSG_DATA(cmsg); - } + if (parse_cmsg(&msg, &res)) { + log_err("CMSG parse: parse_cmsg() failed"); + return 1; + } - if (cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_CREDENTIALS) { - if (cmsg->cmsg_len < sizeof(*ucred)) { - log_err("CMSG parse: SCM_CREDENTIALS wrong len"); - return 1; - } + /* pidfd from SCM_PIDFD should point to the parent process PID */ + parent_pid = + get_pid_from_fdinfo_file(*res.pidfd, "Pid:", sizeof("Pid:") - 1); + if (parent_pid != getppid()) { + log_err("wrong SCM_PIDFD %d != %d", parent_pid, getppid()); + close(*res.pidfd); + return 1; + } - ucred = (void *)CMSG_DATA(cmsg); - } + close(*res.pidfd); + return 0; +} + +static int cmsg_check_dead(int fd, int expected_pid) +{ + int err; + struct msghdr msg = { 0 }; + struct cmsg_data res; + struct iovec iov; + int data = 0; + char control[CMSG_SPACE(sizeof(struct ucred)) + + CMSG_SPACE(sizeof(int))] = { 0 }; + pid_t client_pid; + struct pidfd_info info = { + .mask = PIDFD_INFO_EXIT, + }; + + iov.iov_base = &data; + iov.iov_len = sizeof(data); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + err = recvmsg(fd, &msg, 0); + if (err < 0) { + log_err("recvmsg"); + return 1; } - /* send(pfd, "x", sizeof(char), 0) */ - if (data != 'x') { + if (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + log_err("recvmsg: truncated"); + return 1; + } + + /* send(cfd, "y", sizeof(char), 0) */ + if (data != 'y') { log_err("recvmsg: data corruption"); return 1; } - if (!pidfd) { - log_err("CMSG parse: SCM_PIDFD not found"); + if (parse_cmsg(&msg, &res)) { + log_err("CMSG parse: parse_cmsg() failed"); return 1; } - if (!ucred) { - log_err("CMSG parse: SCM_CREDENTIALS not found"); + /* + * pidfd from SCM_PIDFD should point to the client_pid. + * Let's read exit information and check if it's what + * we expect to see. 
+ */ + if (ioctl(*res.pidfd, PIDFD_GET_INFO, &info)) { + log_err("%s: ioctl(PIDFD_GET_INFO) failed", __func__); + close(*res.pidfd); return 1; } - /* pidfd from SCM_PIDFD should point to the parent process PID */ - parent_pid = - get_pid_from_fdinfo_file(*pidfd, "Pid:", sizeof("Pid:") - 1); - if (parent_pid != getppid()) { - log_err("wrong SCM_PIDFD %d != %d", parent_pid, getppid()); + if (!(info.mask & PIDFD_INFO_EXIT)) { + log_err("%s: No exit information from ioctl(PIDFD_GET_INFO)", __func__); + close(*res.pidfd); return 1; } + err = WIFEXITED(info.exit_code) ? WEXITSTATUS(info.exit_code) : 1; + if (err != CHILD_EXIT_CODE_OK) { + log_err("%s: wrong exit_code %d != %d", __func__, err, CHILD_EXIT_CODE_OK); + close(*res.pidfd); + return 1; + } + + close(*res.pidfd); return 0; } @@ -291,6 +389,24 @@ static void fill_sockaddr(struct sock_addr *addr, bool abstract) memcpy(sun_path_buf, addr->sock_name, strlen(addr->sock_name)); } +static int sk_enable_cred_pass(int sk) +{ + int on = 0; + + on = 1; + if (setsockopt(sk, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on))) { + log_err("Failed to set SO_PASSCRED"); + return 1; + } + + if (setsockopt(sk, SOL_SOCKET, SO_PASSPIDFD, &on, sizeof(on))) { + log_err("Failed to set SO_PASSPIDFD"); + return 1; + } + + return 0; +} + static void client(FIXTURE_DATA(scm_pidfd) *self, const FIXTURE_VARIANT(scm_pidfd) *variant) { @@ -299,7 +415,6 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, struct ucred peer_cred; int peer_pidfd; pid_t peer_pid; - int on = 0; cfd = socket(AF_UNIX, variant->type, 0); if (cfd < 0) { @@ -322,14 +437,8 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, child_die(); } - on = 1; - if (setsockopt(cfd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on))) { - log_err("Failed to set SO_PASSCRED"); - child_die(); - } - - if (setsockopt(cfd, SOL_SOCKET, SO_PASSPIDFD, &on, sizeof(on))) { - log_err("Failed to set SO_PASSPIDFD"); + if (sk_enable_cred_pass(cfd)) { + log_err("sk_enable_cred_pass() failed"); child_die(); } @@ 
-340,6 +449,12 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, child_die(); } + /* send something to the parent so it can receive SCM_PIDFD too and validate it */ + if (send(cfd, "y", sizeof(char), 0) == -1) { + log_err("Failed to send(cfd, \"y\", sizeof(char), 0)"); + child_die(); + } + /* skip further for SOCK_DGRAM as it's not applicable */ if (variant->type == SOCK_DGRAM) return; @@ -398,7 +513,13 @@ TEST_F(scm_pidfd, test) close(self->server); close(self->startup_pipe[0]); client(self, variant); - exit(0); + + /* + * It's a bit unusual, but in case of success we return non-zero + * exit code (CHILD_EXIT_CODE_OK) and then we expect to read it + * from ioctl(PIDFD_GET_INFO) in cmsg_check_dead(). + */ + exit(CHILD_EXIT_CODE_OK); } close(self->startup_pipe[1]); @@ -421,9 +542,17 @@ TEST_F(scm_pidfd, test) ASSERT_NE(-1, err); } - close(pfd); waitpid(self->client_pid, &child_status, 0); - ASSERT_EQ(0, WIFEXITED(child_status) ? WEXITSTATUS(child_status) : 1); + /* see comment before exit(CHILD_EXIT_CODE_OK) */ + ASSERT_EQ(CHILD_EXIT_CODE_OK, WIFEXITED(child_status) ? WEXITSTATUS(child_status) : 1); + + err = sk_enable_cred_pass(pfd); + ASSERT_EQ(0, err); + + err = cmsg_check_dead(pfd, self->client_pid); + ASSERT_EQ(0, err); + + close(pfd); } TEST_HARNESS_MAIN -- 2.43.0

3 months, 3 weeks

1
0
0 0

[RFC PATCH v8 0/7] Add NUMA mempolicy support for KVM guest-memfd

by Shivank Garg

This series introduces NUMA-aware memory placement support for KVM guests with guest_memfd memory backends. It builds upon Fuad Tabba's work that enabled host-mapping for guest_memfd memory [1] and can be applied directly on the KVM tree (branch:queue, base commit:7915077245) [2]. == Background == KVM's guest-memfd memory backend currently lacks support for NUMA policy enforcement, causing guest memory allocations to be distributed across host nodes according to the kernel's default behavior, irrespective of any policy specified by the VMM. This limitation arises because conventional userspace NUMA control mechanisms like mbind(2) don't work since the memory isn't directly mapped to userspace when allocations occur. Fuad's work [1] provides the necessary mmap capability, and this series leverages it to enable mbind(2). == Implementation == This series implements proper NUMA policy support for guest-memfd by: 1. Adding mempolicy-aware allocation APIs to the filemap layer. 2. Introducing custom inodes (via a dedicated slab-allocated inode cache, kvm_gmem_inode_info) to store NUMA policy and metadata for guest memory. 3. Implementing get/set_policy vm_ops in guest_memfd to support NUMA policy. With these changes, VMMs can now control guest memory placement by mapping the guest_memfd file descriptor and using mbind(2) to specify: - Policy modes: default, bind, interleave, or preferred - Host NUMA nodes: List of target nodes for memory allocation These policies affect only future allocations and do not migrate existing memory. This matches mbind(2)'s default behavior which affects only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL flags (not supported for guest_memfd as it is unmovable by design). == Upstream Plan == Phased approach as per David's guest_memfd extension overview [3] and community calls [4]: Phase 1 (this series): 1. Focuses on shared guest_memfd support (non-CoCo VMs). 2. Builds on Fuad's host-mapping work. Phase 2 (future work): 1. 
NUMA support for private guest_memfd (CoCo VMs). 2. Depends on SNP in-place conversion support [5]. This series provides a clean integration path for NUMA-aware memory management for guest_memfd and lays the groundwork for future confidential computing NUMA capabilities. Please review and provide feedback! Thanks, Shivank == Changelog == - v1,v2: Extended the KVM_CREATE_GUEST_MEMFD IOCTL to pass mempolicy. - v3: Introduced fbind() syscall for VMM memory-placement configuration. - v4-v6: Current approach using shared_policy support and vm_ops (based on suggestions from David [6] and guest_memfd bi-weekly upstream call discussion [7]). - v7: Use inodes to store NUMA policy instead of file [8]. - v8: Rebase on top of Fuad's V12: Host mmaping for guest_memfd memory. [1] https://lore.kernel.org/all/20250611133330.1514028-1-tabba@google.com [2] https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=queue [3] https://lore.kernel.org/all/c1c9591d-218a-495c-957b-ba356c8f8e09@redhat.com [4] https://docs.google.com/document/d/1M6766BzdY1Lhk7LiR5IqVR8B8mG3cr-cxTxOrAo… [5] https://lore.kernel.org/all/20250613005400.3694904-1-michael.roth@amd.com [6] https://lore.kernel.org/all/6fbef654-36e2-4be5-906e-2a648a845278@redhat.com [7] https://lore.kernel.org/all/2b77e055-98ac-43a1-a7ad-9f9065d7f38f@amd.com [8] https://lore.kernel.org/all/diqzbjumm167.fsf@ackerleytng-ctop.c.googlers.com Ackerley Tng (1): KVM: guest_memfd: Use guest mem inodes instead of anonymous inodes Shivank Garg (5): security: Export anon_inode_make_secure_inode for KVM guest_memfd mm/mempolicy: Export memory policy symbols KVM: guest_memfd: Add slab-allocated inode cache KVM: guest_memfd: Enforce NUMA mempolicy using shared policy KVM: guest_memfd: selftests: Add tests for mmap and NUMA policy support Shivansh Dhiman (1): mm/filemap: Add mempolicy support to the filemap layer fs/anon_inodes.c | 20 +- include/linux/fs.h | 2 + include/linux/pagemap.h | 41 +++ include/uapi/linux/magic.h | 1 + mm/filemap.c | 27 +- 
mm/mempolicy.c | 6 + tools/testing/selftests/kvm/Makefile.kvm | 1 + .../testing/selftests/kvm/guest_memfd_test.c | 123 ++++++++- virt/kvm/guest_memfd.c | 254 ++++++++++++++++-- virt/kvm/kvm_main.c | 7 +- virt/kvm/kvm_mm.h | 10 +- 11 files changed, 456 insertions(+), 36 deletions(-) -- 2.43.0 --- == Earlier Postings == v7: https://lore.kernel.org/all/20250408112402.181574-1-shivankg@amd.com v6: https://lore.kernel.org/all/20250226082549.6034-1-shivankg@amd.com v5: https://lore.kernel.org/all/20250219101559.414878-1-shivankg@amd.com v4: https://lore.kernel.org/all/20250210063227.41125-1-shivankg@amd.com v3: https://lore.kernel.org/all/20241105164549.154700-1-shivankg@amd.com v2: https://lore.kernel.org/all/20240919094438.10987-1-shivankg@amd.com v1: https://lore.kernel.org/all/20240916165743.201087-1-shivankg@amd.com

3 months, 3 weeks

8
33
0 0

[PATCH v3 0/3] replace `allow(...)` lints with `expect(...)`

by Onur Özkan

Changes in v3: - `#![expect(internal_features)]` is reverted from `rust/compiler_builtins.rs` which was done in the first 2 versions. Onur Özkan (3): replace `#[allow(...)]` with `#[expect(...)]` rust: remove `#[allow(clippy::unnecessary_cast)]` rust: remove `#[allow(clippy::non_send_fields_in_send_ty)]` drivers/gpu/nova-core/regs.rs | 2 +- rust/kernel/alloc/allocator_test.rs | 2 +- rust/kernel/cpufreq.rs | 1 - rust/kernel/devres.rs | 2 +- rust/kernel/driver.rs | 2 +- rust/kernel/drm/ioctl.rs | 8 ++++---- rust/kernel/error.rs | 3 +-- rust/kernel/init.rs | 6 +++--- rust/kernel/kunit.rs | 2 +- rust/kernel/opp.rs | 4 ++-- rust/kernel/types.rs | 2 +- rust/macros/helpers.rs | 2 +- 12 files changed, 17 insertions(+), 19 deletions(-) -- 2.50.0

3 months, 3 weeks

4
15
0 0

[PATCH 0/6] selftests/damon: add python and drgn based DAMON sysfs functionality tests

by SeongJae Park

DAMON sysfs interface is the bridge between the user space and the kernel space for DAMON parameters. There is no good and simple test to see if the parameters are set as expected. Existing DAMON selftests therefore test end-to-end features. For example, damos_quota_goal.py runs a DAMOS scheme with quota goal set against a test program running an artificial access pattern, and sees if the result is as expected. Such tests cover only a small part of DAMON. Adding more tests is also complicated. Finally, the reliability of the test itself on different systems is bad. 'drgn' is a tool that can extract kernel internal data structures like DAMON parameters. Add a test that passes specific DAMON parameters via DAMON sysfs reusing _damon_sysfs.py, extracts the resulting DAMON parameters via 'drgn', and compares those. Note that this test is not adding exhaustive tests of all DAMON parameters and input combinations but very basic things. Advancing the test infrastructure and adding more tests are future work. Changes from RFC (https://lore.kernel.org/20250622210330.40490-1-sj@kernel.org) - Rebase on latest mm-new SeongJae Park (6): selftests/damon: add drgn script for extracting damon status selftests/damon/_damon_sysfs: set Kdamond.pid in start() selftests/damon: add python and drgn-based DAMON sysfs test selftests/damon/sysfs.py: test monitoring attribute parameters selftests/damon/sysfs.py: test adaptive targets parameter selftests/damon/sysfs.py: test DAMOS schemes parameters setup tools/testing/selftests/damon/Makefile | 1 + tools/testing/selftests/damon/_damon_sysfs.py | 3 + .../selftests/damon/drgn_dump_damon_status.py | 161 ++++++++++++++++++ tools/testing/selftests/damon/sysfs.py | 115 +++++++++++++ 4 files changed, 280 insertions(+) create mode 100755 tools/testing/selftests/damon/drgn_dump_damon_status.py create mode 100755 tools/testing/selftests/damon/sysfs.py base-commit: 5ab6feac2d83ebbf0d0d2eedf0505878ba677dcb -- 2.39.5

3 months, 3 weeks

1
6
0 0

[PATCH bpf-next 0/3] bpf: Fix and test aux usage after do_check_insn()

by Luis Gerhorst

Fix cur_aux()->nospec_result test after do_check_insn() referring to the to-be-analyzed (potentially unsafe) instruction, not the already-analyzed (safe) instruction. This might allow an unsafe insn to slip through on a speculative path. Create some tests from the reproducer [1]. Commit d6f1c85f2253 ("bpf: Fall back to nospec for Spectre v1") should not be in any stable kernel yet, therefore bpf-next should suffice. [1] https://lore.kernel.org/bpf/685b3c1b.050a0220.2303ee.0010.GAE@google.com/ Changes since RFC: - Introduce prev_aux() as suggested by Alexei. For this, we must move the env->prev_insn_idx assignment to happen directly after do_check_insn(), for which I have created a separate commit. This patch could be simplified by using a local prev_aux variable as suggested by Eduard, but I figured one might find the new assignment-strategy easier to understand (before, prev_insn_idx and env->prev_insn_idx were out-of-sync for the latter part of the loop). Also, like this we do not have an additional prev_* variable that must be kept in-sync and the local variable's usage (old prev_insn_idx, new tmp) is much more local. If you think it would be better to not take the risk and keep the fix simple by just introducing the prev_aux variable, let me know. - Change WARN_ON_ONCE() to verifier_bug_if() as suggested by Alexei - Change assertion to check instruction is BPF_JMP[32] as suggested by Eduard - RFC: https://lore.kernel.org/bpf/8734bmoemx.fsf@fau.de/ Luis Gerhorst (3): bpf: Update env->prev_insn_idx after do_check_insn() bpf: Fix aux usage after do_check_insn() selftests/bpf: Add Spectre v4 tests kernel/bpf/verifier.c | 30 ++-- tools/testing/selftests/bpf/progs/bpf_misc.h | 4 + .../selftests/bpf/progs/verifier_unpriv.c | 149 ++++++++++++++++++ 3 files changed, 174 insertions(+), 9 deletions(-) base-commit: d69bafe6ee2b5eff6099fa26626ecc2963f0f363 -- 2.49.0

3 months, 3 weeks

1
3
0 0

[kvm-unit-tests PATCH v6] riscv: sbi: Add SBI Debug Triggers Extension tests

by Jesse Taube

Add tests for the DBTR SBI extension. Signed-off-by: Jesse Taube <jesse(a)rivosinc.com> Reviewed-by: Charlie Jenkins <charlie(a)rivosinc.com> Tested-by: Charlie Jenkins <charlie(a)rivosinc.com> --- V1 -> V2: - Call report_prefix_pop before returning - Disable compressed instructions in exec_call, update related comment - Remove extra "| 1" in dbtr_test_load - Remove extra newlines - Remove extra tabs in check_exec - Remove typedefs from enums - Return when dbtr_install_trigger fails - s/avalible/available/g - s/unistall/uninstall/g V2 -> V3: - Change SBI_DBTR_SHMEM_INVALID_ADDR to -1UL - Move all dbtr functions to sbi-dbtr.c - Move INSN_LEN to processor.h - Update include list - Use C-style comments V3 -> V4: - Include libcflat.h - Remove #define SBI_DBTR_SHMEM_INVALID_ADDR V4 -> V5: - Sort includes - Add kfail for update triggers V5 -> V6: - Add assert in gen_tdata1 - Add prefix to dbtr_test_type - Add TRIG_STATE_DMODE - Add TRIG_STATE_RESERVED - Align function parameters with opening parenthesis - Change OpenSBI < v1.7 to < v1.5 - Constantly use spaces in prefix rather than _ - Export split_phys_addr - Fix MCONTROL_U and MCONTROL_M mix up - Fix swapped VU and VS - Move /* to own line - Print type in dbtr_test_type - Remove _BIT suffix from macros - Remove duplicate MODE_S - Remove spaces before include - Rename tdata1,2 to trigger and control in dbtr_install_trigger - Report skip in dbtr_test_multiple - Report variables in info not pass or fail - s/save/store/g - sbi_debug_set_shmem use split_phys_addr - Use if (!report(... 
in dbtr_test_disable_enable --- lib/riscv/asm/sbi.h | 1 + riscv/Makefile | 1 + riscv/sbi-dbtr.c | 839 ++++++++++++++++++++++++++++++++++++++++++++ riscv/sbi-tests.h | 2 + riscv/sbi.c | 3 +- 5 files changed, 845 insertions(+), 1 deletion(-) create mode 100644 riscv/sbi-dbtr.c diff --git a/lib/riscv/asm/sbi.h b/lib/riscv/asm/sbi.h index a5738a5c..78fd6e2a 100644 --- a/lib/riscv/asm/sbi.h +++ b/lib/riscv/asm/sbi.h @@ -51,6 +51,7 @@ enum sbi_ext_id { SBI_EXT_SUSP = 0x53555350, SBI_EXT_FWFT = 0x46574654, SBI_EXT_SSE = 0x535345, + SBI_EXT_DBTR = 0x44425452, }; enum sbi_ext_base_fid { diff --git a/riscv/Makefile b/riscv/Makefile index 11e68eae..55c7ac93 100644 --- a/riscv/Makefile +++ b/riscv/Makefile @@ -20,6 +20,7 @@ all: $(tests) $(TEST_DIR)/sbi-deps += $(TEST_DIR)/sbi-asm.o $(TEST_DIR)/sbi-deps += $(TEST_DIR)/sbi-fwft.o $(TEST_DIR)/sbi-deps += $(TEST_DIR)/sbi-sse.o +$(TEST_DIR)/sbi-deps += $(TEST_DIR)/sbi-dbtr.o all_deps += $($(TEST_DIR)/sbi-deps) diff --git a/riscv/sbi-dbtr.c b/riscv/sbi-dbtr.c new file mode 100644 index 00000000..7837553f --- /dev/null +++ b/riscv/sbi-dbtr.c @@ -0,0 +1,839 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SBI DBTR testsuite + * + * Copyright (C) 2025, Rivos Inc., Jesse Taube <jesse(a)rivosinc.com> + */ + +#include <libcflat.h> +#include <bitops.h> + +#include <asm/io.h> +#include <asm/processor.h> + +#include "sbi-tests.h" + +#define RV_MAX_TRIGGERS 32 + +#define SBI_DBTR_TRIG_STATE_MAPPED BIT(0) +#define SBI_DBTR_TRIG_STATE_U BIT(1) +#define SBI_DBTR_TRIG_STATE_S BIT(2) +#define SBI_DBTR_TRIG_STATE_VU BIT(3) +#define SBI_DBTR_TRIG_STATE_VS BIT(4) +#define SBI_DBTR_TRIG_STATE_HAVE_HW_TRIG BIT(5) +#define SBI_DBTR_TRIG_STATE_RESERVED GENMASK(7, 6) + +#define SBI_DBTR_TRIG_STATE_HW_TRIG_IDX_SHIFT 8 +#define SBI_DBTR_TRIG_STATE_HW_TRIG_IDX(trig_state) (trig_state >> SBI_DBTR_TRIG_STATE_HW_TRIG_IDX_SHIFT) + +#define SBI_DBTR_TDATA1_TYPE_SHIFT (__riscv_xlen - 4) +#define SBI_DBTR_TDATA1_DMODE BIT_UL(__riscv_xlen - 5) + +#define 
SBI_DBTR_TDATA1_MCONTROL6_LOAD BIT(0) +#define SBI_DBTR_TDATA1_MCONTROL6_STORE BIT(1) +#define SBI_DBTR_TDATA1_MCONTROL6_EXECUTE BIT(2) +#define SBI_DBTR_TDATA1_MCONTROL6_U BIT(3) +#define SBI_DBTR_TDATA1_MCONTROL6_S BIT(4) +#define SBI_DBTR_TDATA1_MCONTROL6_M BIT(6) +#define SBI_DBTR_TDATA1_MCONTROL6_SELECT BIT(21) +#define SBI_DBTR_TDATA1_MCONTROL6_VU BIT(23) +#define SBI_DBTR_TDATA1_MCONTROL6_VS BIT(24) + +#define SBI_DBTR_TDATA1_MCONTROL_LOAD BIT(0) +#define SBI_DBTR_TDATA1_MCONTROL_STORE BIT(1) +#define SBI_DBTR_TDATA1_MCONTROL_EXECUTE BIT(2) +#define SBI_DBTR_TDATA1_MCONTROL_U BIT(3) +#define SBI_DBTR_TDATA1_MCONTROL_S BIT(4) +#define SBI_DBTR_TDATA1_MCONTROL_M BIT(6) +#define SBI_DBTR_TDATA1_MCONTROL_SELECT BIT(19) + +enum McontrolType { + SBI_DBTR_TDATA1_TYPE_NONE = (0UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_LEGACY = (1UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_MCONTROL = (2UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_ICOUNT = (3UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_ITRIGGER = (4UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_ETRIGGER = (5UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_MCONTROL6 = (6UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_TMEXTTRIGGER = (7UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_RESERVED0 = (8UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_RESERVED1 = (9UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_RESERVED2 = (10UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_RESERVED3 = (11UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_CUSTOM0 = (12UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_CUSTOM1 = (13UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_CUSTOM2 = (14UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_DISABLED = (15UL << SBI_DBTR_TDATA1_TYPE_SHIFT), +}; + +enum Tdata1Value { + VALUE_NONE = 0, + VALUE_LOAD = BIT(0), + VALUE_STORE = BIT(1), + 
VALUE_EXECUTE = BIT(2), +}; + +enum Tdata1Mode { + MODE_NONE = 0, + MODE_M = BIT(0), + MODE_U = BIT(1), + MODE_S = BIT(2), + MODE_VU = BIT(3), + MODE_VS = BIT(4), +}; + +enum sbi_ext_dbtr_fid { + SBI_EXT_DBTR_NUM_TRIGGERS = 0, + SBI_EXT_DBTR_SETUP_SHMEM, + SBI_EXT_DBTR_TRIGGER_READ, + SBI_EXT_DBTR_TRIGGER_INSTALL, + SBI_EXT_DBTR_TRIGGER_UPDATE, + SBI_EXT_DBTR_TRIGGER_UNINSTALL, + SBI_EXT_DBTR_TRIGGER_ENABLE, + SBI_EXT_DBTR_TRIGGER_DISABLE, +}; + +struct sbi_dbtr_data_msg { + unsigned long tstate; + unsigned long tdata1; + unsigned long tdata2; + unsigned long tdata3; +}; + +struct sbi_dbtr_id_msg { + unsigned long idx; +}; + +/* SBI shared mem messages layout */ +struct sbi_dbtr_shmem_entry { + union { + struct sbi_dbtr_data_msg data; + struct sbi_dbtr_id_msg id; + }; +}; + +static bool dbtr_handled; + +/* Expected to be leaf function as not to disrupt frame-pointer */ +static __attribute__((naked)) void exec_call(void) +{ + /* skip over nop when triggered instead of ret. */ + asm volatile (".option push\n" + ".option arch, -c\n" + "nop\n" + "ret\n" + ".option pop\n"); +} + +static void dbtr_exception_handler(struct pt_regs *regs) +{ + dbtr_handled = true; + + /* Reading *epc may cause a fault, skip over nop */ + if ((void *)regs->epc == exec_call) { + regs->epc += 4; + return; + } + + /* WARNING: Skips over the trapped intruction */ + regs->epc += RV_INSN_LEN(readw((void *)regs->epc)); +} + +static bool do_store(void *tdata2) +{ + bool ret; + + writel(0, tdata2); + + ret = dbtr_handled; + dbtr_handled = false; + + return ret; +} + +static bool do_load(void *tdata2) +{ + bool ret; + + readl(tdata2); + + ret = dbtr_handled; + dbtr_handled = false; + + return ret; +} + +static bool do_exec(void) +{ + bool ret; + + exec_call(); + + ret = dbtr_handled; + dbtr_handled = false; + + return ret; +} + +static unsigned long gen_tdata1_mcontrol(enum Tdata1Mode mode, enum Tdata1Value value) +{ + unsigned long tdata1 = SBI_DBTR_TDATA1_TYPE_MCONTROL; + + if (value & VALUE_LOAD) 
+ tdata1 |= SBI_DBTR_TDATA1_MCONTROL_LOAD; + + if (value & VALUE_STORE) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL_STORE; + + if (value & VALUE_EXECUTE) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL_EXECUTE; + + if (mode & MODE_M) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL_M; + + if (mode & MODE_U) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL_U; + + if (mode & MODE_S) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL_S; + + return tdata1; +} + +static unsigned long gen_tdata1_mcontrol6(enum Tdata1Mode mode, enum Tdata1Value value) +{ + unsigned long tdata1 = SBI_DBTR_TDATA1_TYPE_MCONTROL6; + + if (value & VALUE_LOAD) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_LOAD; + + if (value & VALUE_STORE) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_STORE; + + if (value & VALUE_EXECUTE) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_EXECUTE; + + if (mode & MODE_M) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_M; + + if (mode & MODE_U) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_U; + + if (mode & MODE_S) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_S; + + if (mode & MODE_VU) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_VU; + + if (mode & MODE_VS) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_VS; + + return tdata1; +} + +static unsigned long gen_tdata1(enum McontrolType type, enum Tdata1Value value, enum Tdata1Mode mode) +{ + switch (type) { + case SBI_DBTR_TDATA1_TYPE_MCONTROL: + return gen_tdata1_mcontrol(mode, value); + case SBI_DBTR_TDATA1_TYPE_MCONTROL6: + return gen_tdata1_mcontrol6(mode, value); + default: + assert_msg(false, "Invalid mcontrol type: %lu", type); + return 0; + } +} + +static struct sbiret sbi_debug_num_triggers(unsigned long trig_tdata1) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_NUM_TRIGGERS, trig_tdata1, 0, 0, 0, 0, 0); +} + +static struct sbiret sbi_debug_set_shmem_raw(unsigned long shmem_phys_lo, + unsigned long shmem_phys_hi, + unsigned long flags) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_SETUP_SHMEM, shmem_phys_lo, + shmem_phys_hi, flags, 0, 0, 0); +} + +static struct sbiret sbi_debug_set_shmem(void *shmem) +{ + unsigned long 
base_addr_lo, base_addr_hi; + + split_phys_addr(virt_to_phys(shmem), &base_addr_hi, &base_addr_lo); + return sbi_debug_set_shmem_raw(base_addr_lo, base_addr_hi, 0); +} + +static struct sbiret sbi_debug_read_triggers(unsigned long trig_idx_base, + unsigned long trig_count) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_TRIGGER_READ, trig_idx_base, + trig_count, 0, 0, 0, 0); +} + +static struct sbiret sbi_debug_install_triggers(unsigned long trig_count) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_TRIGGER_INSTALL, trig_count, 0, 0, 0, 0, 0); +} + +static struct sbiret sbi_debug_update_triggers(unsigned long trig_count) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_TRIGGER_UPDATE, trig_count, 0, 0, 0, 0, 0); +} + +static struct sbiret sbi_debug_uninstall_triggers(unsigned long trig_idx_base, + unsigned long trig_idx_mask) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_TRIGGER_UNINSTALL, trig_idx_base, + trig_idx_mask, 0, 0, 0, 0); +} + +static struct sbiret sbi_debug_enable_triggers(unsigned long trig_idx_base, + unsigned long trig_idx_mask) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_TRIGGER_ENABLE, trig_idx_base, + trig_idx_mask, 0, 0, 0, 0); +} + +static struct sbiret sbi_debug_disable_triggers(unsigned long trig_idx_base, + unsigned long trig_idx_mask) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_TRIGGER_DISABLE, trig_idx_base, + trig_idx_mask, 0, 0, 0, 0); +} + +static bool dbtr_install_trigger(struct sbi_dbtr_shmem_entry *shmem, void *trigger, + unsigned long control) +{ + struct sbiret sbi_ret; + bool ret; + + shmem->data.tdata1 = control; + shmem->data.tdata2 = (unsigned long)trigger; + + sbi_ret = sbi_debug_install_triggers(1); + ret = sbiret_report_error(&sbi_ret, SBI_SUCCESS, "sbi_debug_install_triggers"); + if (ret) + install_exception_handler(EXC_BREAKPOINT, dbtr_exception_handler); + + return ret; +} + +static bool dbtr_uninstall_trigger(void) +{ + struct sbiret ret; + + install_exception_handler(EXC_BREAKPOINT, NULL); + + ret = 
sbi_debug_uninstall_triggers(0, 1); + return sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); +} + +static unsigned long dbtr_test_num_triggers(void) +{ + struct sbiret ret; + unsigned long tdata1 = 0; + /* sbi_debug_num_triggers will return trig_max in sbiret.value when trig_tdata1 == 0 */ + + /* should be at least one trigger. */ + ret = sbi_debug_num_triggers(tdata1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_num_triggers"); + + if (ret.value == 0) { + report_fail("sbi_debug_num_triggers: Returned 0 triggers available"); + } else { + report_pass("sbi_debug_num_triggers: Returned triggers available"); + report_info("sbi_debug_num_triggers: Returned %lu triggers available", ret.value); + } + + return ret.value; +} + +static enum McontrolType dbtr_test_type(unsigned long *num_trig) +{ + struct sbiret ret; + unsigned long tdata1 = SBI_DBTR_TDATA1_TYPE_MCONTROL6; + + report_prefix_push("test type"); + + ret = sbi_debug_num_triggers(tdata1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_num_triggers: mcontrol6"); + *num_trig = ret.value; + if (ret.value > 0) { + report_pass("sbi_debug_num_triggers: Returned mcontrol6 triggers available"); + report_info("sbi_debug_num_triggers: Returned %lu mcontrol6 triggers available", + ret.value); + report_prefix_pop(); + return tdata1; + } + + tdata1 = SBI_DBTR_TDATA1_TYPE_MCONTROL; + + ret = sbi_debug_num_triggers(tdata1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_num_triggers: mcontrol"); + *num_trig = ret.value; + if (ret.value > 0) { + report_pass("sbi_debug_num_triggers: Returned mcontrol triggers available"); + report_info("sbi_debug_num_triggers: Returned %lu mcontrol triggers available", + ret.value); + report_prefix_pop(); + return tdata1; + } + + report_fail("sbi_debug_num_triggers: Returned 0 mcontrol(6) triggers available"); + report_prefix_pop(); + + return SBI_DBTR_TDATA1_TYPE_NONE; +} + +static struct sbiret dbtr_test_store_install_uninstall(struct 
sbi_dbtr_shmem_entry *shmem, + enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + + report_prefix_push("store trigger"); + + shmem->data.tdata1 = gen_tdata1(type, VALUE_STORE, MODE_S); + shmem->data.tdata2 = (unsigned long)&test; + + ret = sbi_debug_install_triggers(1); + if (!sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_install_triggers")) { + report_prefix_pop(); + return ret; + } + + install_exception_handler(EXC_BREAKPOINT, dbtr_exception_handler); + + report(do_store(&test), "triggered"); + + if (do_load(&test)) + report_fail("triggered by load"); + + ret = sbi_debug_uninstall_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); + + if (do_store(&test)) + report_fail("triggered after uninstall"); + + install_exception_handler(EXC_BREAKPOINT, NULL); + report_prefix_pop(); + + return ret; +} + +static void dbtr_test_update(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + bool kfail; + + report_prefix_push("update trigger"); + + if (!dbtr_install_trigger(shmem, NULL, gen_tdata1(type, VALUE_NONE, MODE_NONE))) { + report_prefix_pop(); + return; + } + + shmem->id.idx = 0; + shmem->data.tdata1 = gen_tdata1(type, VALUE_STORE, MODE_S); + shmem->data.tdata2 = (unsigned long)&test; + + ret = sbi_debug_update_triggers(1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_update_triggers"); + + /* + * Known broken update_triggers. 
+ * https://lore.kernel.org/opensbi/aDdp1UeUh7GugeHp@ghost/T/#t + */ + kfail = __sbi_get_imp_id() == SBI_IMPL_OPENSBI && + __sbi_get_imp_version() < sbi_impl_opensbi_mk_version(1, 7); + report_kfail(kfail, do_store(&test), "triggered"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_load(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + + report_prefix_push("load trigger"); + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_LOAD, MODE_S))) { + report_prefix_pop(); + return; + } + + report(do_load(&test), "triggered"); + + if (do_store(&test)) + report_fail("triggered by store"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_disable_enable(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + + report_prefix_push("disable trigger"); + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + + ret = sbi_debug_disable_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_disable_triggers"); + + if (!report(!do_store(&test), "should not trigger")) { + dbtr_uninstall_trigger(); + report_prefix_pop(); + report_skip("enable trigger: no disable"); + + return; + } + + report_prefix_pop(); + report_prefix_push("enable trigger"); + + ret = sbi_debug_enable_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_enable_triggers"); + + report(do_store(&test), "triggered"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_exec(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + + report_prefix_push("exec trigger"); + /* check if loads and stores trigger exec */ + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_EXECUTE, MODE_S))) { + report_prefix_pop(); + return; + } + + if (do_load(&test)) + report_fail("triggered by 
load"); + + if (do_store(&test)) + report_fail("triggered by store"); + + dbtr_uninstall_trigger(); + + /* Check if exec works */ + if (!dbtr_install_trigger(shmem, exec_call, gen_tdata1(type, VALUE_EXECUTE, MODE_S))) { + report_prefix_pop(); + return; + } + report(do_exec(), "triggered"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_read(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + const unsigned long tstatus_expected = SBI_DBTR_TRIG_STATE_S | SBI_DBTR_TRIG_STATE_MAPPED; + const unsigned long tdata1 = gen_tdata1(type, VALUE_STORE, MODE_S); + static unsigned long test; + struct sbiret ret; + + report_prefix_push("read trigger"); + if (!dbtr_install_trigger(shmem, &test, tdata1)) { + report_prefix_pop(); + return; + } + + ret = sbi_debug_read_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_read_triggers"); + + report(shmem->data.tdata1 == tdata1, "tdata1 expected: 0x%016lx", tdata1); + report_info("tdata1 found: 0x%016lx", shmem->data.tdata1); + report(shmem->data.tdata2 == ((unsigned long)&test), "tdata2 expected: 0x%016lx", + (unsigned long)&test); + report_info("tdata2 found: 0x%016lx", shmem->data.tdata2); + report(shmem->data.tstate == tstatus_expected, "tstate expected: 0x%016lx", tstatus_expected); + report_info("tstate found: 0x%016lx", shmem->data.tstate); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void check_exec(unsigned long base) +{ + struct sbiret ret; + + report(do_exec(), "exec triggered"); + + ret = sbi_debug_uninstall_triggers(base, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); +} + +static void dbtr_test_multiple(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type, + unsigned long num_trigs) +{ + static unsigned long test[2]; + struct sbiret ret; + bool have_three = num_trigs > 2; + + if (num_trigs < 2) { + report_skip("test multiple"); + return; + } + + report_prefix_push("test multiple"); + + if 
(!dbtr_install_trigger(shmem, &test[0], gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + if (!dbtr_install_trigger(shmem, &test[1], gen_tdata1(type, VALUE_LOAD, MODE_S))) + goto error; + if (have_three && + !dbtr_install_trigger(shmem, exec_call, gen_tdata1(type, VALUE_EXECUTE, MODE_S))) { + ret = sbi_debug_uninstall_triggers(1, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); + goto error; + } + + report(do_store(&test[0]), "store triggered"); + + if (do_load(&test[0])) + report_fail("store triggered by load"); + + report(do_load(&test[1]), "load triggered"); + + if (do_store(&test[1])) + report_fail("load triggered by store"); + + if (have_three) + check_exec(2); + + ret = sbi_debug_uninstall_triggers(1, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); + + if (do_load(&test[1])) + report_fail("load triggered after uninstall"); + + report(do_store(&test[0]), "store triggered"); + + if (!have_three && + dbtr_install_trigger(shmem, exec_call, gen_tdata1(type, VALUE_EXECUTE, MODE_S))) + check_exec(1); + +error: + ret = sbi_debug_uninstall_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); + + install_exception_handler(EXC_BREAKPOINT, NULL); + report_prefix_pop(); +} + +static void dbtr_test_multiple_types(struct sbi_dbtr_shmem_entry *shmem, unsigned long type) +{ + static unsigned long test; + + report_prefix_push("test multiple types"); + + /* check if loads and stores trigger exec */ + if (!dbtr_install_trigger(shmem, &test, + gen_tdata1(type, VALUE_EXECUTE | VALUE_LOAD | VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + + report(do_load(&test), "load triggered"); + + report(do_store(&test), "store triggered"); + + dbtr_uninstall_trigger(); + + /* Check if exec works */ + if (!dbtr_install_trigger(shmem, exec_call, + gen_tdata1(type, VALUE_EXECUTE | VALUE_LOAD | VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + 
+ report(do_exec(), "exec triggered"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_disable_uninstall(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + + report_prefix_push("disable uninstall"); + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + + ret = sbi_debug_disable_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_disable_triggers"); + + dbtr_uninstall_trigger(); + + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + + report(do_store(&test), "triggered"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_uninstall_enable(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + + report_prefix_push("uninstall enable"); + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + dbtr_uninstall_trigger(); + + ret = sbi_debug_enable_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_enable_triggers"); + + install_exception_handler(EXC_BREAKPOINT, dbtr_exception_handler); + + report(!do_store(&test), "should not trigger"); + + install_exception_handler(EXC_BREAKPOINT, NULL); + report_prefix_pop(); +} + +static void dbtr_test_uninstall_update(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + bool kfail; + + report_prefix_push("uninstall update"); + if (!dbtr_install_trigger(shmem, NULL, gen_tdata1(type, VALUE_NONE, MODE_NONE))) { + report_prefix_pop(); + return; + } + + dbtr_uninstall_trigger(); + + shmem->id.idx = 0; + shmem->data.tdata1 = gen_tdata1(type, VALUE_STORE, MODE_S); + shmem->data.tdata2 = (unsigned long)&test; + + /* + * Known broken update_triggers. 
+ * https://lore.kernel.org/opensbi/aDdp1UeUh7GugeHp@ghost/T/#t + */ + kfail = __sbi_get_imp_id() == SBI_IMPL_OPENSBI && + __sbi_get_imp_version() < sbi_impl_opensbi_mk_version(1, 7); + ret = sbi_debug_update_triggers(1); + sbiret_kfail_error(kfail, &ret, SBI_ERR_FAILURE, "sbi_debug_update_triggers"); + + install_exception_handler(EXC_BREAKPOINT, dbtr_exception_handler); + + report(!do_store(&test), "should not trigger"); + + install_exception_handler(EXC_BREAKPOINT, NULL); + report_prefix_pop(); +} + +static void dbtr_test_disable_read(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + const unsigned long tstatus_expected = SBI_DBTR_TRIG_STATE_S | SBI_DBTR_TRIG_STATE_MAPPED; + const unsigned long tdata1 = gen_tdata1(type, VALUE_STORE, MODE_NONE); + static unsigned long test; + struct sbiret ret; + + report_prefix_push("disable read"); + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + + ret = sbi_debug_disable_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_disable_triggers"); + + ret = sbi_debug_read_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_read_triggers"); + + report(shmem->data.tdata1 == tdata1, "tdata1 expected: 0x%016lx", tdata1); + report_info("tdata1 found: 0x%016lx", shmem->data.tdata1); + report(shmem->data.tdata2 == ((unsigned long)&test), "tdata2 expected: 0x%016lx", + (unsigned long)&test); + report_info("tdata2 found: 0x%016lx", shmem->data.tdata2); + report(shmem->data.tstate == tstatus_expected, "tstate expected: 0x%016lx", tstatus_expected); + report_info("tstate found: 0x%016lx", shmem->data.tstate); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +void check_dbtr(void) +{ + static struct sbi_dbtr_shmem_entry shmem[RV_MAX_TRIGGERS] = {}; + unsigned long num_trigs; + enum McontrolType trig_type; + struct sbiret ret; + + report_prefix_push("dbtr"); + + if (!sbi_probe(SBI_EXT_DBTR)) { + report_skip("extension 
not available"); + report_prefix_pop(); + return; + } + + if (__sbi_get_imp_id() == SBI_IMPL_OPENSBI && + __sbi_get_imp_version() < sbi_impl_opensbi_mk_version(1, 5)) { + report_skip("OpenSBI < v1.5 detected, skipping tests"); + report_prefix_pop(); + return; + } + + num_trigs = dbtr_test_num_triggers(); + if (!num_trigs) + goto error; + + trig_type = dbtr_test_type(&num_trigs); + if (trig_type == SBI_DBTR_TDATA1_TYPE_NONE) + goto error; + + ret = sbi_debug_set_shmem(shmem); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_set_shmem"); + + ret = dbtr_test_store_install_uninstall(&shmem[0], trig_type); + /* install or uninstall failed */ + if (ret.error != SBI_SUCCESS) + goto error; + + dbtr_test_load(&shmem[0], trig_type); + dbtr_test_exec(&shmem[0], trig_type); + dbtr_test_read(&shmem[0], trig_type); + dbtr_test_disable_enable(&shmem[0], trig_type); + dbtr_test_update(&shmem[0], trig_type); + dbtr_test_multiple_types(&shmem[0], trig_type); + dbtr_test_multiple(shmem, trig_type, num_trigs); + dbtr_test_disable_uninstall(&shmem[0], trig_type); + dbtr_test_uninstall_enable(&shmem[0], trig_type); + dbtr_test_uninstall_update(&shmem[0], trig_type); + dbtr_test_disable_read(&shmem[0], trig_type); + +error: + report_prefix_pop(); +} diff --git a/riscv/sbi-tests.h b/riscv/sbi-tests.h index d5c4ae70..c1ebf016 100644 --- a/riscv/sbi-tests.h +++ b/riscv/sbi-tests.h @@ -97,8 +97,10 @@ static inline bool env_enabled(const char *env) return s && (*s == '1' || *s == 'y' || *s == 'Y'); } +void split_phys_addr(phys_addr_t paddr, unsigned long *hi, unsigned long *lo); void sbi_bad_fid(int ext); void check_sse(void); +void check_dbtr(void); #endif /* __ASSEMBLER__ */ #endif /* _RISCV_SBI_TESTS_H_ */ diff --git a/riscv/sbi.c b/riscv/sbi.c index edb1a6be..3b8aadce 100644 --- a/riscv/sbi.c +++ b/riscv/sbi.c @@ -105,7 +105,7 @@ static int rand_online_cpu(prng_state *ps) return cpu; } -static void split_phys_addr(phys_addr_t paddr, unsigned long *hi, unsigned long *lo) +void 
split_phys_addr(phys_addr_t paddr, unsigned long *hi, unsigned long *lo) { *lo = (unsigned long)paddr; *hi = 0; @@ -1561,6 +1561,7 @@ int main(int argc, char **argv) check_susp(); check_sse(); check_fwft(); + check_dbtr(); return report_summary(); } -- 2.43.0

3 months, 3 weeks

3
3
0 0

[PATCH v2 0/3] replace `allow(...)` lints with `expect(...)`

by Onur Özkan

Changes in v2: - Removed lints are not replaced with `expect` in the first diff. - Removals are done in separate diffs for each. The `#[allow(clippy::non_send_fields_in_send_ty)]` removal was tested on 1.81 and clippy was still happy with it. I couldn't test it on 1.78 because when I go below 1.81 `menuconfig` no longer shows the Rust option. And any manual changes I make to `.config` are immediately reverted on `make` invocations. Onur Özkan (3): replace `#[allow(...)]` with `#[expect(...)]` rust: remove `#[allow(clippy::unnecessary_cast)]` rust: remove `#[allow(clippy::non_send_fields_in_send_ty)]` drivers/gpu/nova-core/regs.rs | 2 +- rust/compiler_builtins.rs | 2 +- rust/kernel/alloc/allocator_test.rs | 2 +- rust/kernel/cpufreq.rs | 1 - rust/kernel/devres.rs | 2 +- rust/kernel/driver.rs | 2 +- rust/kernel/drm/ioctl.rs | 8 ++++---- rust/kernel/error.rs | 3 +-- rust/kernel/init.rs | 6 +++--- rust/kernel/kunit.rs | 2 +- rust/kernel/opp.rs | 4 ++-- rust/kernel/types.rs | 2 +- rust/macros/helpers.rs | 2 +- 13 files changed, 18 insertions(+), 20 deletions(-) -- 2.50.0

3 months, 3 weeks

3
6
0 0

[PATCH 0/2] Possible TTY privilege escalation in TIOCSTI ioctl

by Abhinav Saxena via B4 Relay

This patch series was initially sent to security(a)k.o; resending it in public. I might follow-up with a tests series which addresses similar issues with TIOCLINUX. =============== The TIOCSTI ioctl uses capable(CAP_SYS_ADMIN) for access control, which checks the current process's credentials. However, it doesn't validate against the file opener's credentials stored in file->f_cred. This creates a potential security issue where an unprivileged process can open a TTY fd and pass it to a privileged process via SCM_RIGHTS. The privileged process may then inadvertently grant access based on its elevated privileges rather than the original opener's credentials. Background ========== As noted in previous discussion, while CONFIG_LEGACY_TIOCSTI can restrict TIOCSTI usage, it is enabled by default in most distributions. Even when CONFIG_LEGACY_TIOCSTI=n, processes with CAP_SYS_ADMIN can still use TIOCSTI according to the Kconfig documentation. Additionally, CONFIG_LEGACY_TIOCSTI controls the default value for the dev.tty.legacy_tiocsti sysctl, which remains runtime-configurable. This means the described attack vector could work on systems even with CONFIG_LEGACY_TIOCSTI=n, particularly on Ubuntu 24.04 where it's "restricted" but still functional. Solution Approach ================= This series addresses the issue through SELinux LSM integration rather than modifying core TTY credential checking to avoid potential compatibility issues with existing userspace. The enhancement adds proper current task and file credential capability validation in SELinux's selinux_file_ioctl() hook specifically for TIOCSTI operations. Testing ======= All patches have been validated using: - scripts/checkpatch.pl --strict (0 errors, 0 warnings) - Functional testing on kernel v6.16-rc2 - File descriptor passing security test scenarios - SELinux policy enforcement testing The fd_passing_security test demonstrates the security concern. 
To verify, disable legacy TIOCSTI and run the test: $ echo "0" | sudo tee /proc/sys/dev/tty/legacy_tiocsti $ sudo ./tools/testing/selftests/tty/tty_tiocsti_test -t fd_passing_security Patch Overview ============== PATCH 1/2: selftests/tty: add TIOCSTI test suite Comprehensive test suite demonstrating the issue and fix validation PATCH 2/2: selinux: add capability checks for TIOCSTI ioctl Core security enhancement via SELinux LSM hook References ========== - tty_ioctl(4) - documents TIOCSTI ioctl and capability requirements - commit 83efeeeb3d04 ("tty: Allow TIOCSTI to be disabled") - Documentation/security/credentials.rst - https://github.com/KSPP/linux/issues/156 - https://lore.kernel.org/linux-hardening/Y0m9l52AKmw6Yxi1@hostpad/ - drivers/tty/Kconfig Configuration References: [1] - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/dri… [2] - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/dri… [3] - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/dri… To: Shuah Khan <shuah(a)kernel.org> To: Nathan Chancellor <nathan(a)kernel.org> To: Nick Desaulniers <nick.desaulniers+lkml(a)gmail.com> To: Bill Wendling <morbo(a)google.com> To: Justin Stitt <justinstitt(a)google.com> To: Paul Moore <paul(a)paul-moore.com> To: Stephen Smalley <stephen.smalley.work(a)gmail.com> To: Ondrej Mosnacek <omosnace(a)redhat.com> Cc: linux-kernel(a)vger.kernel.org Cc: linux-kselftest(a)vger.kernel.org Cc: llvm(a)lists.linux.dev Cc: selinux(a)vger.kernel.org Signed-off-by: Abhinav Saxena <xandfury(a)gmail.com> --- Abhinav Saxena (2): selftests/tty: add TIOCSTI test suite selinux: add capability checks for TIOCSTI ioctl security/selinux/hooks.c | 6 + tools/testing/selftests/tty/Makefile | 6 +- tools/testing/selftests/tty/config | 1 + tools/testing/selftests/tty/tty_tiocsti_test.c | 421 +++++++++++++++++++++++++ 4 files changed, 433 insertions(+), 1 deletion(-) --- base-commit: 5adb635077d1b4bd65b183022775a59a378a9c00 
change-id: 20250618-toicsti-bug-7822b8e94a32 Best regards, -- Abhinav Saxena <xandfury(a)gmail.com>

3 months, 3 weeks

7
10
0 0

[PATCH RFT v17 0/8] fork: Support shadow stacks in clone3()

by Mark Brown

The kernel has recently added support for shadow stacks, currently x86 only using their CET feature but both arm64 and RISC-V have equivalent features (GCS and Zicfiss respectively), I am actively working on GCS[1]. With shadow stacks the hardware maintains an additional stack containing only the return addresses for branch instructions which is not generally writeable by userspace and ensures that any returns are to the recorded addresses. This provides some protection against ROP attacks and makes it easier to collect call stacks. These shadow stacks are allocated in the address space of the userspace process. Our API for shadow stacks does not currently offer userspace any flexibility for managing the allocation of shadow stacks for newly created threads, instead the kernel allocates a new shadow stack with the same size as the normal stack whenever a thread is created with the feature enabled. The stacks allocated in this way are freed by the kernel when the thread exits or shadow stacks are disabled for the thread. This lack of flexibility and control isn't ideal, in the vast majority of cases the shadow stack will be over allocated and the implicit allocation and deallocation is not consistent with other interfaces. As far as I can tell the interface is done in this manner mainly because the shadow stack patches were in development since before clone3() was implemented. Since clone3() is readily extensible let's add support for specifying a shadow stack when creating a new thread or process, keeping the current implicit allocation behaviour if one is not specified either with clone3() or through the use of clone(). The user must provide a shadow stack pointer, this must point to memory mapped for use as a shadow stack by map_shadow_stack() with an architecture specified shadow stack token at the top of the stack.
Yuri Khrustalev has raised questions from the libc side regarding discoverability of extended clone3() structure sizes[2], this seems like a general issue with clone3(). There was a suggestion to add a hwcap on arm64 which isn't ideal but is doable there, though architecture specific mechanisms would also be needed for x86 (and RISC-V if its support gets merged before this does). Please note that the x86 portions of this code are build tested only, I don't appear to have a system that can run CET available to me. [1] https://lore.kernel.org/linux-arm-kernel/20241001-arm64-gcs-v13-0-222b78d87… [2] https://lore.kernel.org/r/aCs65ccRQtJBnZ_5@arm.com Signed-off-by: Mark Brown <broonie(a)kernel.org> --- Changes in v17: - Rebase onto v6.16-rc1. - Link to v16: https://lore.kernel.org/r/20250416-clone3-shadow-stack-v16-0-2ffc9ca3917b@k… Changes in v16: - Rebase onto v6.15-rc2. - Roll in fixes from x86 testing from Rick Edgecombe. - Rework so that the argument is shadow_stack_token. - Link to v15: https://lore.kernel.org/r/20250408-clone3-shadow-stack-v15-0-3fa245c6e3be@k… Changes in v15: - Rebase onto v6.15-rc1. - Link to v14: https://lore.kernel.org/r/20250206-clone3-shadow-stack-v14-0-805b53af73b9@k… Changes in v14: - Rebase onto v6.14-rc1. - Link to v13: https://lore.kernel.org/r/20241203-clone3-shadow-stack-v13-0-93b89a81a5ed@k… Changes in v13: - Rebase onto v6.13-rc1. - Link to v12: https://lore.kernel.org/r/20241031-clone3-shadow-stack-v12-0-7183eb8bee17@k… Changes in v12: - Add the regular prctl() to the userspace API document since arm64 support is queued in -next. - Link to v11: https://lore.kernel.org/r/20241005-clone3-shadow-stack-v11-0-2a6a2bd6d651@k… Changes in v11: - Rebase onto arm64 for-next/gcs, which is based on v6.12-rc1, and integrate arm64 support. - Rework the interface to specify a shadow stack pointer rather than a base and size like we do for the regular stack.
- Link to v10: https://lore.kernel.org/r/20240821-clone3-shadow-stack-v10-0-06e8797b9445@k… Changes in v10: - Integrate fixes & improvements for the x86 implementation from Rick Edgecombe. - Require that the shadow stack be VM_WRITE. - Require that the shadow stack base and size be sizeof(void *) aligned. - Clean up trailing newline. - Link to v9: https://lore.kernel.org/r/20240819-clone3-shadow-stack-v9-0-962d74f99464@ke… Changes in v9: - Pull token validation earlier and report problems with an error return to parent rather than signal delivery to the child. - Verify that the top of the supplied shadow stack is VM_SHADOW_STACK. - Rework token validation to only do the page mapping once. - Drop no longer needed support for testing for signals in selftest. - Fix typo in comments. - Link to v8: https://lore.kernel.org/r/20240808-clone3-shadow-stack-v8-0-0acf37caf14c@ke… Changes in v8: - Fix token verification with user specified shadow stack. - Don't track user managed shadow stacks for child processes. - Link to v7: https://lore.kernel.org/r/20240731-clone3-shadow-stack-v7-0-a9532eebfb1d@ke… Changes in v7: - Rebase onto v6.11-rc1. - Typo fixes. - Link to v6: https://lore.kernel.org/r/20240623-clone3-shadow-stack-v6-0-9ee7783b1fb9@ke… Changes in v6: - Rebase onto v6.10-rc3. - Ensure we don't try to free the parent shadow stack in error paths of x86 arch code. - Spelling fixes in userspace API document. - Additional cleanups and improvements to the clone3() tests to support the shadow stack tests. - Link to v5: https://lore.kernel.org/r/20240203-clone3-shadow-stack-v5-0-322c69598e4b@ke… Changes in v5: - Rebase onto v6.8-rc2. - Rework ABI to have the user allocate the shadow stack memory with map_shadow_stack() and a token. - Force inlining of the x86 shadow stack enablement. - Move shadow stack enablement out into a shared header for reuse by other tests. 
- Link to v4: https://lore.kernel.org/r/20231128-clone3-shadow-stack-v4-0-8b28ffe4f676@ke… Changes in v4: - Formatting changes. - Use a define for minimum shadow stack size and move some basic validation to fork.c. - Link to v3: https://lore.kernel.org/r/20231120-clone3-shadow-stack-v3-0-a7b8ed3e2acc@ke… Changes in v3: - Rebase onto v6.7-rc2. - Remove stale shadow_stack in internal kargs. - If a shadow stack is specified unconditionally use it regardless of CLONE_ parameters. - Force enable shadow stacks in the selftest. - Update changelogs for RISC-V feature rename. - Link to v2: https://lore.kernel.org/r/20231114-clone3-shadow-stack-v2-0-b613f8681155@ke… Changes in v2: - Rebase onto v6.7-rc1. - Remove ability to provide preallocated shadow stack, just specify the desired size. - Link to v1: https://lore.kernel.org/r/20231023-clone3-shadow-stack-v1-0-d867d0b5d4d0@ke… --- Mark Brown (8): arm64/gcs: Return a success value from gcs_alloc_thread_stack() Documentation: userspace-api: Add shadow stack API documentation selftests: Provide helper header for shadow stack testing fork: Add shadow stack support to clone3() selftests/clone3: Remove redundant flushes of output streams selftests/clone3: Factor more of main loop into test_clone3() selftests/clone3: Allow tests to flag if -E2BIG is a valid error code selftests/clone3: Test shadow stack support Documentation/userspace-api/index.rst | 1 + Documentation/userspace-api/shadow_stack.rst | 44 +++++ arch/arm64/include/asm/gcs.h | 8 +- arch/arm64/kernel/process.c | 8 +- arch/arm64/mm/gcs.c | 61 +++++- arch/x86/include/asm/shstk.h | 11 +- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/shstk.c | 57 +++++- include/asm-generic/cacheflush.h | 11 ++ include/linux/sched/task.h | 17 ++ include/uapi/linux/sched.h | 9 +- kernel/fork.c | 96 +++++++-- tools/testing/selftests/clone3/clone3.c | 226 ++++++++++++++++++---- tools/testing/selftests/clone3/clone3_selftests.h | 65 ++++++- tools/testing/selftests/ksft_shstk.h | 98 
++++++++++ 15 files changed, 633 insertions(+), 81 deletions(-) --- base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494 change-id: 20231019-clone3-shadow-stack-15d40d2bf536 Best regards, -- Mark Brown <broonie(a)kernel.org>

3 months, 3 weeks

3
12
0 0

Re: [PATCH v2] selftests: cachestat: Refactor test to remove duplication

by Nhat Pham

On Tue, Jun 24, 2025 at 9:58 PM Suresh Chandrappa <suresh.k.chandrappa(a)gmail.com> wrote: > > Hi @nphamcs > Can you please check the modified change Is this supposed to be on top of the earlier patch you sent out? In that case, you should send both together as a patch series. > > Thanks > Suresh K C > > > On Wed, 11 Jun 2025, 23:39 Suresh K C, <suresh.k.chandrappa(a)gmail.com> wrote: >> >> From: Suresh K C <suresh.k.chandrappa(a)gmail.com> >> >> Refactored the mmap and shmem test logic into a common function >> to reduce code duplication and improve maintainability >> >> Changes in v2: >> Refactored mmap and shmem tests into a common function >> Renamed test function to run_cachestat_test() >> Removed test for /proc/cpuinfo as a general /proc test case already exists >> >> Signed-off-by: Suresh K C <suresh.k.chandrappa(a)gmail.com> >> --- >> .../selftests/cachestat/test_cachestat.c | 97 ++++++------------- >> 1 file changed, 30 insertions(+), 67 deletions(-) >> >> diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c >> index 81e7f6dd2279..7c2f64175943 100644 >> --- a/tools/testing/selftests/cachestat/test_cachestat.c >> +++ b/tools/testing/selftests/cachestat/test_cachestat.c >> @@ -22,7 +22,7 @@ >> >> static const char * const dev_files[] = { >> "/dev/zero", "/dev/null", "/dev/urandom", >> - "/proc/version","/proc/cpuinfo","/proc" >> + "/proc/version","/proc" So you removed one file that you added in an earlier patch, right? Then why bother adding it in the first place...? Can you either: 1. Send the two patch together as a series. In the first patch, do not add /proc/cpuinfo. or 2. Squash them into a single patch. I'll let you decide if this is worth it. 
>> }; >> >> void print_cachestat(struct cachestat *cs) >> @@ -33,6 +33,11 @@ void print_cachestat(struct cachestat *cs) >> cs->nr_evicted, cs->nr_recently_evicted); >> } >> >> +enum file_type { >> + FILE_MMAP, >> + FILE_SHMEM >> +}; >> + >> bool write_exactly(int fd, size_t filesize) >> { >> int random_fd = open("/dev/urandom", O_RDONLY); >> @@ -202,66 +207,8 @@ static int test_cachestat(const char *filename, bool write_random, bool create, >> return ret; >> } >> >> -bool test_cachestat_mmap(void){ >> - >> - size_t PS = sysconf(_SC_PAGESIZE); >> - size_t filesize = PS * 512 * 2;; >> - int syscall_ret; >> - size_t compute_len = PS * 512; >> - struct cachestat_range cs_range = { PS, compute_len }; >> - char *filename = "tmpshmcstat"; >> - unsigned long num_pages = compute_len / PS; >> - struct cachestat cs; >> - bool ret = true; >> - int fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666); >> - if (fd < 0) { >> - ksft_print_msg("Unable to create mmap file.\n"); >> - ret = false; >> - goto out; >> - } >> - if (ftruncate(fd, filesize)) { >> - ksft_print_msg("Unable to truncate mmap file.\n"); >> - ret = false; >> - goto close_fd; >> - } >> - if (!write_exactly(fd, filesize)) { >> - ksft_print_msg("Unable to write to mmap file.\n"); >> - ret = false; >> - goto close_fd; >> - } >> - char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); >> - if (map == MAP_FAILED) { >> - ksft_print_msg("mmap failed.\n"); >> - ret = false; >> - goto close_fd; >> - } >> - >> - for (int i = 0; i < filesize; i++) { >> - map[i] = 'A'; >> - } >> - map[filesize - 1] = 'X'; >> - >> - syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0); >> - >> - if (syscall_ret) { >> - ksft_print_msg("Cachestat returned non-zero.\n"); >> - ret = false; >> - } else { >> - print_cachestat(&cs); >> - if (cs.nr_cache + cs.nr_evicted != num_pages) { >> - ksft_print_msg("Total number of cached and evicted pages is off.\n"); >> - ret = false; >> - } >> - } >> - >> -close_fd: >> - 
close(fd); >> - unlink(filename); >> -out: >> - return ret; >> -} >> >> -bool test_cachestat_shmem(void) >> +bool run_cachestat_test(enum file_type type) Can you just call this function test_cachestat(enum file_type type) ? >> { >> size_t PS = sysconf(_SC_PAGESIZE); >> size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */ >> @@ -271,27 +218,43 @@ bool test_cachestat_shmem(void) >> char *filename = "tmpshmcstat"; >> struct cachestat cs; >> bool ret = true; >> + int fd; >> unsigned long num_pages = compute_len / PS; >> - int fd = shm_open(filename, O_CREAT | O_RDWR, 0600); >> + if (type == FILE_SHMEM) >> + fd = shm_open(filename, O_CREAT | O_RDWR, 0600); >> + else >> + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666); >> >> if (fd < 0) { >> - ksft_print_msg("Unable to create shmem file.\n"); >> + ksft_print_msg("Unable to create file.\n"); >> ret = false; >> goto out; >> } >> >> if (ftruncate(fd, filesize)) { >> - ksft_print_msg("Unable to truncate shmem file.\n"); >> + ksft_print_msg("Unable to truncate file.\n"); >> ret = false; >> goto close_fd; >> } >> >> if (!write_exactly(fd, filesize)) { >> - ksft_print_msg("Unable to write to shmem file.\n"); >> + ksft_print_msg("Unable to write to file.\n"); >> ret = false; >> goto close_fd; >> } >> >> + if (type == FILE_MMAP){ >> + char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); >> + if (map == MAP_FAILED) { >> + ksft_print_msg("mmap failed.\n"); >> + ret = false; >> + goto close_fd; >> + } >> + for (int i = 0; i < filesize; i++) { >> + map[i] = 'A'; >> + } >> + map[filesize - 1] = 'X'; >> + } >> syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0); >> >> if (syscall_ret) { >> @@ -333,7 +296,7 @@ int main(void) >> ret = 1; >> } >> >> - for (int i = 0; i < 6; i++) { >> + for (int i = 0; i < 5; i++) { >> const char *dev_filename = dev_files[i]; >> >> if (test_cachestat(dev_filename, false, false, false, >> @@ -367,14 +330,14 @@ int main(void) >> break; >> } >> >> - if 
(test_cachestat_shmem()) >> + if (run_cachestat_test(FILE_SHMEM)) >> ksft_test_result_pass("cachestat works with a shmem file\n"); >> else { >> ksft_test_result_fail("cachestat fails with a shmem file\n"); >> ret = 1; >> } >> >> - if (test_cachestat_mmap()) >> + if (run_cachestat_test(FILE_MMAP)) >> ksft_test_result_pass("cachestat works with a mmap file\n"); >> else { >> ksft_test_result_fail("cachestat fails with a mmap file\n"); >> -- >> 2.43.0 >>

3 months, 3 weeks

1
0
0 0

[PATCH net-next v2 0/4] selftest: net: Add selftest for netpoll

by Breno Leitao

I am submitting a new selftest for the netpoll subsystem specifically targeting the case where the RX is polling in the TX path, which is a case for which we don't have any test in the tree today. This is done when netpoll_poll_dev() is called, and this test creates a scenario where that is probable. The test does the following: 1) Configuring a single RX/TX queue to increase contention on the interface. 2) Generating background traffic to saturate the network, mimicking real-world congestion. 3) Sending netconsole messages to trigger netpoll polling and monitor its behavior. 4) Using dynamic netconsole targets via configfs, with the ability to delete and recreate targets during the test. 5) Running bpftrace in parallel to verify that netpoll_poll_dev() is called when expected. If it is called, then the test passes, otherwise the test is marked as skipped. In order to achieve it, I stole Jakub's bpftrace helper from [1], and did some small changes that I found useful to use the helper. So, this patchset basically contains: 1) The code stolen from Jakub 2) Improvements on bpftrace() helper 3) The selftest itself Link: https://lore.kernel.org/all/20250421222827.283737-22-kuba@kernel.org/ [1] --- Changes in v1 (from RFC): - Toggle the netconsole interfaces up and down after 5 iterations. - Moved the traffic check under DEBUG (Willem de Bruijn). - Bumped the iterations to 20 given it runs faster now. - Link to the RFC: https://lore.kernel.org/r/20250612-netpoll_test-v1-1-4774fd95933f@debian.org --- Changes in v2: - Stole Jakub's helper to run bpftrace - Removed the DEBUG option and moved logs to logging - Change the code to have a higher chance of calling netpoll_poll_dev(). In my current configuration, it is hitting multiple times during the test.
- Save and restore TX/RX queue size (Jakub) - Link to v1: https://lore.kernel.org/r/20250620-netpoll_test-v1-1-5068832f72fc@debian.org --- Breno Leitao (3): selftests: drv-net: Improve bpftrace utility error handling selftests: drv-net: Strip '@' prefix from bpftrace map keys selftests: net: add netpoll basic functionality test Jakub Kicinski (1): selftests: drv-net: add helper/wrapper for bpftrace tools/testing/selftests/drivers/net/Makefile | 1 + .../testing/selftests/drivers/net/netpoll_basic.py | 344 +++++++++++++++++++++ tools/testing/selftests/net/lib/py/utils.py | 38 +++ 3 files changed, 383 insertions(+) --- base-commit: eb4c27edb4d8dbfbdcc7bc03e0394a0fab8af7d5 change-id: 20250612-netpoll_test-a1324d2057c8 Best regards, -- Breno Leitao <leitao(a)debian.org>

3 months, 3 weeks

4
18
0 0

[PATCH 0/5] sysctl: Remove last two ctl_tables from the kern_table array

by Joel Granados

This is the last series to relocate sysctl tables from kernel/sysctl.c into their respective subsystems. After the move of two ctl_tables (uevent_helper & overflow{uid,gid}), five remain. They either handle variables defined within sysctl.c or serve as a common place for variables that are defined in different architectures. These five will not be moved. Note that this series includes two auxiliary changes: Removal of an unused variable and Nix-based rework of sysctl.sh test script By decentralizing sysctl registrations, subsystem maintainers regain control over their sysctl interfaces, improving maintainability and reducing the likelihood of merge conflicts. All this is made possible by the work done to reduce the ctl_table memory footprint in commit d7a76ec87195 ("sysctl: Remove check for sentinel element in ctl_table arrays"). A few comments on the process: 1. If you prefer to merge this through a non-sysctl tree, please let me know so I can avoid conflicts in linux-next. 2. Apologies if you were copied by mistake—let me know if you'd like to be removed. 3. This series builds on [1], so please rebase accordingly for clean application. 4. Testing done by running sysctl selftests on x86_64 and 0-day. 
Comments/Suggestions greatly appreciated [1] https://lore.kernel.org/20250509-jag-mv_ctltables_iter2-v1-0-d0ad83f5f4c3@k… Signed-off-by: Joel Granados <joel.granados(a)kernel.org> --- Joel Granados (5): sysctl: Nixify sysctl.sh sysctl: Removed unused variable uevent: mv uevent_helper into kobject_uevent.c kernel/sys.c: Move overflow{uid,gid} sysctl into kernel/sys.c sysctl: rename kern_table -> sysctl_subsys_table include/linux/sysctl.h | 1 - kernel/sys.c | 29 +++++++++++++++++++ kernel/sysctl.c | 49 +++++++------------------------- lib/kobject_uevent.c | 20 +++++++++++++ tools/testing/selftests/sysctl/sysctl.sh | 2 +- 5 files changed, 61 insertions(+), 40 deletions(-) --- base-commit: 501dd0fbc76bcae57902ea000d9c6ccd9d5f226e change-id: 20250627-jag-sysctl-823adf5732be Best regards, -- Joel Granados <joel.granados(a)kernel.org>

3 months, 3 weeks

1
5
0 0

[PATCH net-next v11 0/8] Support rate management on traffic classes in devlink and mlx5

by Mark Bloch

V11: - Refactored the devlink code to accept relative TC bandwidth share values instead of percentages. - Updated documentation to clarify that values are interpreted as relative shares. - Refactored the logic in mlx5 to support proportional scaling for tc-bw values. - Switched to `nlmsg_for_each_attr_type()` for cleaner attribute parsing. - Added a hardware selftest to validate TC bandwidth behavior. - Refactored esw_qos_is_node_empty for readability. V10: - Added netdevsim selftest for tc-bw ops. - Dropped header: field as it’s unnecessary for local constants in devlink.yaml. V9: - Defined DEVLINK_RATE_TCS_MAX as 8 in uapi/linux/devlink.h. - Replaced IEEE_8021QAZ_MAX_TCS with DEVLINK_RATE_TCS_MAX throughout the code. - Updated devlink-rate-tc-index-max spec to reference the correct UAPI header. V8: - Limit line width to 80 characters in mlx5 changes instead of 100. - Increase the scheduling node levels to support TC arbitration. - Ensure parent nodes are set correctly in all code paths that extend the hierarchy depth for TC arbitration. - Extended the cover letter with the ongoing discussion on devlink-rate and net-shapers. - Extended the cover letter with the Netdev talk link on this series. V7: - Fixed disabling tc-bw on leaf nodes that did not have tc-bw configured. - Fixed an issue where tc-bw was disabled on a node with assigned vports, ensuring that vport->qos.sched_node->parent is correctly updated with the cloned node. - Declared a constant for the maximum allowed Traffic Class index in devlink rate. - Added a range check to validate rate-tc-index. - Added documentation for the tc-bw argument. - Add a validation check to ensure that the total bandwidth assigned to all traffic classes sums to 100. V6: - Addressed comments on devlink patch #3. - Removed first 4 IFC patches, to be pulled from mlx5-next. V5: - Fix warning in devlink_nl_rate_tc_bw_set(). - Fix target branch of patch #4. 
V4: - Renamed the nested attribute for traffic class bandwidth to DEVLINK_ATTR_RATE_TC_BWS. - Changed the order of the attributes in `devlink.h`. - Refactored the initialization tc-bw array in devlink_nl_rate_tc_bw_set(). - Added extack messages to provide clear feedback on issues with tc-bw arguments. - Updated `rate-tc-bws` to support a multi-attr set, where each attribute includes an index and the corresponding bandwidth for that traffic class. - Handled the issue where the user could provide DEVLINK_ATTR_RATE_TC_BWS with duplicate indices. - Provided ynl examples in patch [1/5] commit message. - Take IFC patches to beginning of the series, targeted for mlx5-next. V3: - Dropped rate-tc-index, using tc-bw array index instead. - Renamed rate-bw to rate-tc-bw. - Documented what the rate-tc-bw represents and added a range check for validation. - Introduced devlink_nl_rate_tc_bw_set() to parse and set the TC bandwidth values. - Updated the user API in the commit message of patch 1/6 to ensure bandwidths sum equals 100. - Fixed missing filling of rate-parent in devlink_nl_rate_fill(). V2: - Included <linux/dcbnl.h> in devlink.h to resolve missing IEEE_8021QAZ_MAX_TCS definition. - Refactored the rate-tc-bw attribute structure to use a separate rate-tc-index. - Updated patch 2/6 title. This patch series extends the devlink-rate API to support traffic class (TC) bandwidth management, enabling more granular control over traffic shaping and rate limiting across multiple TCs. The API now allows users to specify bandwidth proportions for different traffic classes in a single command. This is particularly useful for managing Enhanced Transmission Selection (ETS) for groups of Virtual Functions (VFs), allowing precise bandwidth allocation across traffic classes. Additionally the series refines the QoS handling in net/mlx5 to support TC arbitration and bandwidth management on vports and rate nodes. 
Discussions on traffic class shaping in net-shapers began in V5 [1], where we discussed with maintainers whether net-shapers should support traffic classes and how this could be implemented. Later, after further conversations with Paolo Abeni and Simon Horman, Cosmin provided an update [2], confirming that net-shapers' tree-based hierarchy aligns well with traffic classes when treated as distinct subsets of netdev queues. Since mlx5 enforces a 1:1 mapping between TX queues and traffic classes, this approach seems feasible, though some open questions remain regarding queue reconfiguration and certain mlx5 scheduling behaviors. Building on that discussion, Cosmin has now shared a concrete implementation plan on the netdev mailing list [3]. The plan, developed in collaboration with Paolo and Simon, outlines how net-shapers can be extended to support the same use cases currently covered by devlink-rate, with the eventual goal of aligning both and simplifying the shaping infrastructure in the kernel. This work was presented at Netdev 0x19 in Zagreb [4]. There we presented how TC scheduling is enforced in mlx5 hardware, which led to discussions on the mailing list. A summary of how things work: Classification means labeling a packet with a traffic class based on the packet's DSCP or VLAN PCP field, then treating packets with different traffic classes differently during transmit processing. In a virtualized setup, VFs are untrusted and do not control classification or shaping. Classification is done by the hardware using a prio-to-TC mapping set by the hypervisor. VFs only select which send queue to use and are expected to respect the classification logic by sending each traffic class on its dedicated queue. As stated in the net-shapers plan [3], each transmit queue should carry only a single traffic class. Mixing classes in a single queue can lead to HOL blocking. 
In the mlx5 implementation, if the queue used does not match the classified traffic class, the hardware moves the queue to the correct TC scheduler. This movement is not a reclassification; it’s a necessary enforcement step to ensure traffic class isolation is maintained. Extend devlink-rate API to support rate management on TCs: - devlink: Extend the devlink rate API to support traffic class bandwidth management Introduce a no-op implementation: - net/mlx5: Add no-op implementation for setting tc-bw on rate objects Add support for enabling and disabling TC QoS on vports and nodes: - net/mlx5: Add support for setting tc-bw on nodes - net/mlx5: Add traffic class scheduling support for vport QoS Support for setting tc-bw on rate objects: - net/mlx5: Manage TC arbiter nodes and implement full support for tc-bw [1] https://lore.kernel.org/netdev/20241204220931.254964-1-tariqt@nvidia.com/ [2] https://lore.kernel.org/netdev/67df1a562614b553dcab043f347a0d7c5393ff83.cam… [3] https://lore.kernel.org/netdev/d9831d0c940a7b77419abe7c7330e822bbfd1cfb.cam… [4] https://netdevconf.info/0x19/sessions/talk/optimizing-bandwidth-allocation-… Carolina Jubran (8): netlink: introduce type-checking attribute iteration for nlmsg devlink: Extend devlink rate API with traffic classes bandwidth management selftest: netdevsim: Add devlink rate tc-bw test net/mlx5: Add no-op implementation for setting tc-bw on rate objects net/mlx5: Add support for setting tc-bw on nodes net/mlx5: Add traffic class scheduling support for vport QoS net/mlx5: Manage TC arbiter nodes and implement full support for tc-bw selftests: drv-net: Add test for devlink-rate traffic class bandwidth distribution Documentation/netlink/specs/devlink.yaml | 32 +- .../networking/devlink/devlink-port.rst | 8 + .../net/ethernet/mellanox/mlx5/core/devlink.c | 2 + .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 1037 ++++++++++++++++- .../net/ethernet/mellanox/mlx5/core/esw/qos.h | 8 + .../net/ethernet/mellanox/mlx5/core/eswitch.h | 
14 +- drivers/net/netdevsim/dev.c | 43 + drivers/net/netdevsim/netdevsim.h | 1 + drivers/net/vxlan/vxlan_vnifilter.c | 13 +- fs/nfsd/nfsctl.c | 36 +- include/net/devlink.h | 8 + include/net/netlink.h | 14 + include/uapi/linux/devlink.h | 9 + net/devlink/netlink_gen.c | 15 +- net/devlink/netlink_gen.h | 1 + net/devlink/rate.c | 129 ++ .../drivers/net/hw/devlink_rate_tc_bw.py | 466 ++++++++ .../drivers/net/netdevsim/devlink.sh | 51 + .../testing/selftests/net/lib/py/__init__.py | 2 +- tools/testing/selftests/net/lib/py/ynl.py | 5 + 20 files changed, 1823 insertions(+), 71 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py base-commit: 8dacfd92dbefee829ca555a860e86108fdd1d55b -- 2.34.1

3 months, 3 weeks

2
9
0 0

[PATCH net-next] selftests: forwarding: lib: Split setup_wait()

by Petr Machata

setup_wait() takes an optional argument and then is called from the top level of the test script. That confuses shellcheck, which thinks that maybe the intention is to pass $1 of the script to the function, which is never the case. To avoid having to annotate every single new test with a SC disable, split the function in two: one that takes a mandatory argument, and one that takes no argument at all. Convert the two existing users of that optional argument, both in Spectrum resource selftest, to use the new form. Clean up vxlan_bridge_1q_mc_ul.sh to not pass a now-unused argument. Signed-off-by: Petr Machata <petrm(a)nvidia.com> --- Notes: CC: Shuah Khan <shuah(a)kernel.org> CC: Matthieu Baerts <matttbe(a)kernel.org> CC: linux-kselftest(a)vger.kernel.org .../drivers/net/mlxsw/spectrum-2/resource_scale.sh | 2 +- .../drivers/net/mlxsw/spectrum/resource_scale.sh | 2 +- tools/testing/selftests/net/forwarding/lib.sh | 9 +++++++-- .../selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index 899b6892603f..d7505b933aef 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -51,7 +51,7 @@ for current_test in ${TESTS:-$ALL_TESTS}; do fi ${current_test}_setup_prepare - setup_wait $num_netifs + setup_wait_n $num_netifs # Update target in case occupancy of a certain resource changed # following the test setup. 
target=$(${current_test}_get_target "$should_fail") diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index 482ebb744eba..7b98cdd0580d 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -55,7 +55,7 @@ for current_test in ${TESTS:-$ALL_TESTS}; do continue fi ${current_test}_setup_prepare - setup_wait $num_netifs + setup_wait_n $num_netifs # Update target in case occupancy of a certain resource # changed following the test setup. target=$(${current_test}_get_target "$should_fail") diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 83ee6a07e072..9308b2f77fed 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -526,9 +526,9 @@ setup_wait_dev_with_timeout() return 1 } -setup_wait() +setup_wait_n() { - local num_netifs=${1:-$NUM_NETIFS} + local num_netifs=$1; shift local i for ((i = 1; i <= num_netifs; ++i)); do @@ -539,6 +539,11 @@ setup_wait() sleep $WAIT_TIME } +setup_wait() +{ + setup_wait_n "$NUM_NETIFS" +} + wait_for_dev() { local dev=$1; shift diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh index 7ec58b6b1128..462db0b603e7 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh @@ -765,7 +765,7 @@ ipv6_mcroute_fdb_sep_rx() trap cleanup EXIT setup_prepare -setup_wait "$NUM_NETIFS" +setup_wait tests_run exit "$EXIT_STATUS" -- 2.49.0

3 months, 3 weeks

2
1
0 0

[PATCH] selftests: futex: define SYS_futex on 32-bit architectures with 64-bit time_t

by Ben Zong-You Xie

glibc does not define SYS_futex for 32-bit architectures using 64-bit time_t e.g. riscv32, therefore this test fails to compile since it does not find SYS_futex in C library headers. Define SYS_futex as SYS_futex_time64 in this situation to ensure successful compilation and compatibility. Signed-off-by: Ben Zong-You Xie <ben717(a)andestech.com> Signed-off-by: Cynthia Huang <cynthia(a)andestech.com> --- tools/testing/selftests/futex/include/futextest.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h index ddbcfc9b7bac..7a5fd1d5355e 100644 --- a/tools/testing/selftests/futex/include/futextest.h +++ b/tools/testing/selftests/futex/include/futextest.h @@ -47,6 +47,17 @@ typedef volatile u_int32_t futex_t; FUTEX_PRIVATE_FLAG) #endif +/* + * SYS_futex is expected from system C library, in glibc some 32-bit + * architectures (e.g. RV32) are using 64-bit time_t, therefore it doesn't have + * SYS_futex defined but just SYS_futex_time64. Define SYS_futex as + * SYS_futex_time64 in this situation to ensure the compilation and the + * compatibility. + */ +#if !defined(SYS_futex) && defined(SYS_futex_time64) +#define SYS_futex SYS_futex_time64 +#endif + /** * futex() - SYS_futex syscall wrapper * @uaddr: address of first futex -- 2.34.1

3 months, 3 weeks

2
1
0 0

[PATCH] selftests/futex: Convert 32bit timespec struct to 64bit version for 32bit compatibility mode

by Terry Tritton

Futex_waitv can not accept old_timespec32 struct, so userspace should convert it from 32bit to 64bit before syscall in 32bit compatible mode. This fix is based off [1] Link: https://lore.kernel.org/all/20231203235117.29677-1-wegao@suse.com/ [1] Signed-off-by: Terry Tritton <terry.tritton(a)linaro.org> Signed-off-by: Wei Gao <wegao(a)suse.com> --- The original patch is for an identically named file and function in ltp and we need the same fix in kselftest. The patch is near identical with only a slight change to `syscall` instead of `tst_syscall`. Is the way I have tagged this appropriate? .../testing/selftests/futex/include/futex2test.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h index ea79662405bc..6780e51eb2d6 100644 --- a/tools/testing/selftests/futex/include/futex2test.h +++ b/tools/testing/selftests/futex/include/futex2test.h @@ -55,6 +55,13 @@ struct futex32_numa { futex_t numa; }; +#if !defined(__LP64__) +struct timespec64 { + int64_t tv_sec; + int64_t tv_nsec; +}; +#endif + /** * futex_waitv - Wait at multiple futexes, wake on any * @waiters: Array of waiters @@ -65,7 +72,15 @@ struct futex32_numa { static inline int futex_waitv(volatile struct futex_waitv *waiters, unsigned long nr_waiters, unsigned long flags, struct timespec *timo, clockid_t clockid) { +#if !defined(__LP64__) + struct timespec64 timo64 = {0}; + + timo64.tv_sec = timo->tv_sec; + timo64.tv_nsec = timo->tv_nsec; + return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, &timo64, clockid); +#else return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo, clockid); +#endif } /* -- 2.39.5

3 months, 3 weeks

2
1
0 0

[PATCH v6 00/25] iommufd: Add vIOMMU infrastructure (Part-4 HW QUEUE)

by Nicolin Chen

The vIOMMU object is designed to represent a slice of an IOMMU HW for its virtualization features shared with or passed to user space (a VM mostly) in a way of HW acceleration. This extended the HWPT-based design for more advanced virtualization features. HW QUEUE introduced by this series as a part of the vIOMMU infrastructure represents a HW accelerated queue/buffer for VM to use exclusively, e.g. - NVIDIA's Virtual Command Queue - AMD vIOMMU's Command Buffer, Event Log Buffer, and PPR Log Buffer each of which allows its IOMMU HW to directly access a queue memory owned by a guest VM and allows a guest OS to control the HW queue directly, to avoid VM Exit overheads to improve the performance. Introduce IOMMUFD_OBJ_HW_QUEUE and its pairing IOMMUFD_CMD_HW_QUEUE_ALLOC allowing VMM to forward the IOMMU-specific queue info, such as queue base address, size, etc. Meanwhile, a guest-owned queue needs the guest kernel to control the queue by reading/writing its consumer and producer indexes, via MMIO accesses to the hardware MMIO registers. Introduce an mmap infrastructure for iommufd to support passing through a piece of MMIO region from the host physical address space to the guest physical address space. The mmap info (offset/length) used by an mmap syscall must be pre-allocated and returned to the user space via an output driver-data during an IOMMUFD_CMD_HW_QUEUE_ALLOC call. Thus, it requires a driver-specific user data support in the vIOMMU allocation flow. As a real-world use case, this series implements a HW QUEUE support in the tegra241-cmdqv driver for VCMDQs on NVIDIA Grace CPU. In other words, it is also the Tegra CMDQV series Part-2 (user-space support), reworked from Previous RFCv1: https://lore.kernel.org/all/cover.1712978212.git.nicolinc@nvidia.com/ This enables the HW accelerated feature for NVIDIA Grace CPU. 
Compared to the standard SMMUv3 operating in the nested translation mode trapping CMDQ for TLBI and ATC_INV commands, this gives a huge performance improvement: 70% to 90% reductions of invalidation time were measured by various DMA unmap tests running in a guest OS. // Unmap latencies from "dma_map_benchmark -g @granule -t @threads", // by toggling "/sys/kernel/debug/iommu/tegra241_cmdqv/bypass_vcmdq" @granule | @threads | bypass_vcmdq=1 | bypass_vcmdq=0 4KB 1 35.7 us 5.3 us 16KB 1 41.8 us 6.8 us 64KB 1 68.9 us 9.9 us 128KB 1 109.0 us 12.6 us 256KB 1 187.1 us 18.0 us 4KB 2 96.9 us 6.8 us 16KB 2 97.8 us 7.5 us 64KB 2 151.5 us 10.7 us 128KB 2 257.8 us 12.7 us 256KB 2 443.0 us 17.9 us This is on Github: https://github.com/nicolinc/iommufd/commits/iommufd_hw_queue-v6 Paring QEMU branch for testing: https://github.com/nicolinc/qemu/commits/wip/for_iommufd_hw_queue-v6 Changelog v6 * Rebase on iommufd_hw_queue-prep-v2 * Add Reviewed-by from Kevin and Jason * [iommufd] Update kdocs and notes * [iommufd] Drop redundant pages[i] check * [iommufd] Allow nesting_parent_iova to be 0 * [iommufd] Add iommufd_hw_queue_alloc_phys() * [iommufd] Revise iommufd_viommu_alloc/destroy_mmap APIs * [iommufd] Move destroy ops to vdevice/hw_queue structures * [iommufd] Add union in hw_info struct to share out_data_type field * [iommufd] Replace iopt_pin/unpin_pages() with internal access APIs * [iommufd] Replace vdevice_alloc with vdevice_size and vdevice_init * [iommufd] Replace hw_queue_alloc with get_hw_queue_size/hw_queue_init * [iommufd] Replace IOMMUFD_VIOMMU_FLAG_HW_QUEUE_READS_PA with init_phys * [smmu] Drop arm_smmu_domain_ipa_to_pa * [smmu] Update arm_smmu_impl_ops changes for vsmmu_init * [tegra] Add a vdev_to_vsid macro * [tegra] Add lvcmdq_mutex to protect multi queues * [tegra] Drop duplicated kcalloc for vintf->lvcmdqs (memory leak) v5 https://lore.kernel.org/all/cover.1747537752.git.nicolinc@nvidia.com/ * Rebase on v6.15-rc6 * Add Reviewed-by from Jason and Kevin * Correct 
typos in kdoc and update commit logs * [iommufd] Add a cosmetic fix * [iommufd] Drop unused num_pfns * [iommufd] Drop unnecessary check * [iommufd] Reorder patch sequence * [iommufd] Use io_remap_pfn_range() * [iommufd] Use success oriented flow * [iommufd] Fix max_npages calculation * [iommufd] Add more selftest coverage * [iommufd] Drop redundant static_assert * [iommufd] Fix mmap pfn range validation * [iommufd] Reject unmap on pinned iovas * [iommufd] Drop redundant vm_flags_set() * [iommufd] Drop iommufd_struct_destroy() * [iommufd] Drop redundant queue iova test * [iommufd] Use "mmio_addr" and "mmio_pfn" * [iommufd] Rename to "nesting_parent_iova" * [iommufd] Make iopt_pin_pages call option * [iommufd] Add ictx comparison in depend() * [iommufd] Add iommufd_object_alloc_ucmd() * [iommufd] Move kcalloc() after validations * [iommufd] Replace ictx setting with WARN_ON * [iommufd] Make hw_info's type bidirectional * [smmu] Add supported_vsmmu_type in impl_ops * [smmu] Drop impl report in smmu vendor struct * [tegra] Add IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV * [tegra] Replace "number of VINTFs" with a note * [tegra] Drop the redundant lvcmdq pointer setting * [tegra] Flag IOMMUFD_VIOMMU_FLAG_HW_QUEUE_READS_PA * [tegra] Use "vintf_alloc_vsid" for vdevice_alloc op v4 https://lore.kernel.org/all/cover.1746757630.git.nicolinc@nvidia.com/ * Rebase on v6.15-rc5 * Add Reviewed-by from Vasant * Rename "vQUEUE" to "HW QUEUE" * Use "offset" and "length" for all mmap-related variables * [iommufd] Use u64 for guest PA * [iommufd] Fix typo in uAPI doc * [iommufd] Rename immap_id to offset * [iommufd] Drop the partial-size mmap support * [iommufd] Do not replace WARN_ON with WARN_ON_ONCE * [iommufd] Use "u64 base_addr" for queue base address * [iommufd] Use u64 base_pfn/num_pfns for immap structure * [iommufd] Correct the size passed in to mtree_alloc_range() * [iommufd] Add IOMMUFD_VIOMMU_FLAG_HW_QUEUE_READS_PA to viommu_ops v3 
https://lore.kernel.org/all/cover.1746139811.git.nicolinc@nvidia.com/ * Add Reviewed-by from Baolu, Pranjal, and Alok * Revise kdocs, uAPI docs, and commit logs * Rename "vCMDQ" back to "vQUEUE" for AMD cases * [tegra] Add tegra241_vcmdq_hw_flush_timeout() * [tegra] Rename vsmmu_alloc to alloc_vintf_user * [tegra] Use writel for SID replacement registers * [tegra] Move mmap removal call to vsmmu_destroy op * [tegra] Fix revert in tegra241_vintf_alloc_lvcmdq_user() * [iommufd] Replace "& ~PAGE_MASK" with PAGE_ALIGNED() * [iommufd] Add an object-type "owner" to immap structure * [iommufd] Drop the ictx input in the new for-driver APIs * [iommufd] Add iommufd_vma_ops to keep track of mmap lifecycle * [iommufd] Add viommu-based iommufd_viommu_alloc/destroy_mmap helpers * [iommufd] Rename iommufd_ctx_alloc/free_mmap to _iommufd_alloc/destroy_mmap v2 https://lore.kernel.org/all/cover.1745646960.git.nicolinc@nvidia.com/ * Add Reviewed-by from Jason * [smmu] Fix vsmmu initial value * [smmu] Support impl for hw_info * [tegra] Rename "slot" to "vsid" * [tegra] Update kdocs and commit logs * [tegra] Map/unmap LVCMDQ dynamically * [tegra] Refcount the previous LVCMDQ * [tegra] Return -EEXIST if LVCMDQ exists * [tegra] Simplify VINTF cleanup routine * [tegra] Use vmid and s2_domain in vsmmu * [tegra] Rename "mmap_pgoff" to "immap_id" * [tegra] Add more addr and length validation * [iommufd] Add more narrative to mmap's kdoc * [iommufd] Add iommufd_struct_depend/undepend() * [iommufd] Rename vcmdq_free op to vcmdq_destroy * [iommufd] Fix bug in iommu_copy_struct_to_user() * [iommufd] Drop is_io from iommufd_ctx_alloc_mmap() * [iommufd] Test the queue memory for its contiguity * [iommufd] Return -ENXIO if address or length fails * [iommufd] Do not change @min_last in mock_viommu_alloc() * [iommufd] Generalize TEGRA241_VCMDQ data in core structure * [iommufd] Add selftest coverage for IOMMUFD_CMD_VCMDQ_ALLOC * [iommufd] Add iopt_pin_pages() to prevent queue memory from unmapping 
v1 https://lore.kernel.org/all/cover.1744353300.git.nicolinc@nvidia.com/ Thanks Nicolin Nicolin Chen (25): iommu: Add iommu_copy_struct_to_user helper iommu: Pass in a driver-level user data structure to viommu_init op iommufd/viommu: Allow driver-specific user data for a vIOMMU object iommufd/selftest: Support user_data in mock_viommu_alloc iommufd/selftest: Add coverage for viommu data iommufd/access: Allow access->ops to be NULL for internal use iommufd/access: Add internal APIs for HW queue to use iommufd/viommu: Add driver-defined vDEVICE support iommufd/viommu: Introduce IOMMUFD_OBJ_HW_QUEUE and its related struct iommufd/viommu: Add IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl iommufd/driver: Add iommufd_hw_queue_depend/undepend() helpers iommufd/selftest: Add coverage for IOMMUFD_CMD_HW_QUEUE_ALLOC iommufd: Add mmap interface iommufd/selftest: Add coverage for the new mmap interface Documentation: userspace-api: iommufd: Update HW QUEUE iommu: Allow an input type in hw_info op iommufd: Allow an input data_type via iommu_hw_info iommufd/selftest: Update hw_info coverage for an input data_type iommu/arm-smmu-v3-iommufd: Add vsmmu_size/type and vsmmu_init impl ops iommu/arm-smmu-v3-iommufd: Add hw_info to impl_ops iommu/tegra241-cmdqv: Use request_threaded_irq iommu/tegra241-cmdqv: Simplify deinit flow in tegra241_cmdqv_remove_vintf() iommu/tegra241-cmdqv: Do not statically map LVCMDQs iommu/tegra241-cmdqv: Add user-space use support iommu/tegra241-cmdqv: Add IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV support drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 16 +- drivers/iommu/iommufd/iommufd_private.h | 35 +- drivers/iommu/iommufd/iommufd_test.h | 20 + include/linux/iommu.h | 49 +- include/linux/iommufd.h | 156 ++++++ include/uapi/linux/iommufd.h | 145 +++++- tools/testing/selftests/iommu/iommufd_utils.h | 89 +++- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 25 +- .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 481 +++++++++++++++++- drivers/iommu/intel/iommu.c | 4 + 
drivers/iommu/iommufd/device.c | 84 ++- drivers/iommu/iommufd/driver.c | 79 +++ drivers/iommu/iommufd/io_pagetable.c | 17 +- drivers/iommu/iommufd/main.c | 70 +++ drivers/iommu/iommufd/selftest.c | 150 +++++- drivers/iommu/iommufd/viommu.c | 218 +++++++- tools/testing/selftests/iommu/iommufd.c | 143 +++++- .../selftests/iommu/iommufd_fail_nth.c | 15 +- Documentation/userspace-api/iommufd.rst | 12 + 19 files changed, 1708 insertions(+), 100 deletions(-) -- 2.43.0

3 months, 3 weeks

5
76
0 0

[PATCH 0/2] membarrier: allow cpu_id to be set on more commands

by Dylan Yudaken

Hi, I noticed that only the RSEQ membarrier command allows specifying a specific cpu. I have a (extremely toy) lib I was playing around with [1] and noticed this and that being able to specify the cpu would be useful to me. I'm by no means an expert in this code though - and so could be totally missing something. Additionally this seems really difficult to actually test. I added a self test that just proves "some" interrupts are being sent, but nothing more than that. I don't know what else can be done there. Patch 1 is the main change Patch 2 is the self-test. Which maybe you don't want at all. [1]: https://github.com/DylanZA/rseqlock/commit/be7bc7214fd5aacec47e26126118f8bb… Thanks, Dylan Dylan Yudaken (2): membarrier: allow cpu_id to be set on more commands membarrier: self test for cpu specific calls kernel/sched/membarrier.c | 6 +- tools/testing/selftests/membarrier/.gitignore | 1 + tools/testing/selftests/membarrier/Makefile | 3 +- .../membarrier/membarrier_test_expedited.c | 135 ++++++++++++++++++ .../membarrier/membarrier_test_impl.h | 5 + 5 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/membarrier/membarrier_test_expedited.c base-commit: ee88bddf7f2f5d1f1da87dd7bedc734048b70e88 -- 2.49.0

3 months, 3 weeks

3
4
0 0

[PATCH] kunit: Make default kunit_test timeout configurable via both a module parameter and a Kconfig option

by Marie Zhussupova

To accommodate varying hardware performance and use cases, the default kunit test case timeout (currently 300 seconds) is now configurable. Users can adjust the timeout by either setting the 'timeout' module parameter or the KUNIT_DEFAULT_TIMEOUT Kconfig option to their desired timeout in seconds. Signed-off-by: Marie Zhussupova <marievic(a)google.com> --- lib/kunit/Kconfig | 13 +++++++++++++ lib/kunit/test.c | 15 ++++++++------- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig index a97897edd964..c10ede4b1d22 100644 --- a/lib/kunit/Kconfig +++ b/lib/kunit/Kconfig @@ -93,4 +93,17 @@ config KUNIT_AUTORUN_ENABLED In most cases this should be left as Y. Only if additional opt-in behavior is needed should this be set to N. +config KUNIT_DEFAULT_TIMEOUT + int "Default value of the timeout module parameter" + default 300 + help + Sets the default timeout, in seconds, for Kunit test cases. This value + is further multiplied by a factor determined by the assigned speed + setting: 1x for `DEFAULT`, 3x for `KUNIT_SPEED_SLOW`, and 12x for + `KUNIT_SPEED_VERY_SLOW`. This allows slower tests on slower machines + sufficient time to complete. + + If unsure, the default timeout of 300 seconds is suitable for most + cases. + endif # KUNIT diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 002121675605..f3c6b11f12b8 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -69,6 +69,13 @@ static bool enable_param; module_param_named(enable, enable_param, bool, 0); MODULE_PARM_DESC(enable, "Enable KUnit tests"); +/* + * Configure the base timeout. 
+ */ +static unsigned long kunit_base_timeout = CONFIG_KUNIT_DEFAULT_TIMEOUT; +module_param_named(timeout, kunit_base_timeout, ulong, 0644); +MODULE_PARM_DESC(timeout, "Set the base timeout for Kunit test cases"); + /* * KUnit statistic mode: * 0 - disabled @@ -393,12 +400,6 @@ static int kunit_timeout_mult(enum kunit_speed speed) static unsigned long kunit_test_timeout(struct kunit_suite *suite, struct kunit_case *test_case) { int mult = 1; - /* - * TODO: Make the default (base) timeout configurable, so that users with - * particularly slow or fast machines can successfully run tests, while - * still taking advantage of the relative speed. - */ - unsigned long default_timeout = 300; /* * The default test timeout is 300 seconds and will be adjusted by mult @@ -409,7 +410,7 @@ static unsigned long kunit_test_timeout(struct kunit_suite *suite, struct kunit_ mult = kunit_timeout_mult(suite->attr.speed); if (test_case->attr.speed != KUNIT_SPEED_UNSET) mult = kunit_timeout_mult(test_case->attr.speed); - return mult * default_timeout * msecs_to_jiffies(MSEC_PER_SEC); + return mult * kunit_base_timeout * msecs_to_jiffies(MSEC_PER_SEC); } -- 2.50.0.rc2.761.g2dc52ea45b-goog

3 months, 3 weeks

2
1
0 0

[PATCH net-next v5] page_pool: import Jesper's page_pool benchmark

by Mina Almasry

From: Jesper Dangaard Brouer <hawk(a)kernel.org> We frequently consult Jesper's out-of-tree page_pool benchmark to evaluate page_pool changes. Import the benchmark into the upstream Linux kernel tree so that (a) we're all running the same version, (b) we pave the way for shared improvements, and (c) we can maybe one day integrate it with nipa, if possible. Import bench_page_pool_simple from commit 35b1716d0c30 ("Add page_bench06_walk_all"), from this repository: https://github.com/netoptimizer/prototype-kernel.git Changes done during upstreaming: - Fix checkpatch issues. - Remove the tasklet logic that is no longer needed. - Move under tools/testing. - Create ksft for the benchmark. - Changed slightly how the benchmark gets built. Out of tree, time_bench is built as an independent .ko. Here it is included in bench_page_pool.ko. Steps to run: ``` mkdir -p /tmp/run-pp-bench make -C ./tools/testing/selftests/net/bench make -C ./tools/testing/selftests/net/bench install INSTALL_PATH=/tmp/run-pp-bench rsync --delete -avz --progress /tmp/run-pp-bench mina@$SERVER:~/ ssh mina@$SERVER << EOF cd ~/run-pp-bench && sudo ./test_bench_page_pool.sh EOF ``` Note that by default, the Makefile will build the benchmark for the currently installed kernel in /lib/modules/$(shell uname -r)/build. 
To build against the current tree, do: make KDIR=$(pwd) -C ./tools/testing/selftests/net/bench Output (from Jesper): ``` sudo ./test_bench_page_pool.sh (benchmark dmesg logs snipped) Fast path results: no-softirq-page_pool01 Per elem: 23 cycles(tsc) 6.571 ns ptr_ring results: no-softirq-page_pool02 Per elem: 60 cycles(tsc) 16.862 ns slow path results: no-softirq-page_pool03 Per elem: 265 cycles(tsc) 73.739 ns ``` Output (from me): ``` sudo ./test_bench_page_pool.sh (benchmark dmesg logs snipped) Fast path results: no-softirq-page_pool01 Per elem: 11 cycles(tsc) 4.177 ns ptr_ring results: no-softirq-page_pool02 Per elem: 51 cycles(tsc) 19.117 ns slow path results: no-softirq-page_pool03 Per elem: 168 cycles(tsc) 62.469 ns ``` Results of course will vary based on hardware/kernel/configs, and some variance may be there from run to run due to some noise. Cc: Jesper Dangaard Brouer <hawk(a)kernel.org> Cc: Ilias Apalodimas <ilias.apalodimas(a)linaro.org> Cc: Jakub Kicinski <kuba(a)kernel.org> Cc: Toke Høiland-Jørgensen <toke(a)toke.dk> Signed-off-by: Mina Almasry <almasrymina(a)google.com> Acked-by: Ilias Apalodimas <ilias.apalodimas(a)linaro.org> Signed-off-by: Jesper Dangaard Brouer <hawk(a)kernel.org> --- v5: https://lore.kernel.org/netdev/20250615205914.835368-1-almasrymina@google.c… - Update results in the commit message - Update to build against current tree v4: https://lore.kernel.org/netdev/20250614100853.3f2372f2@kernel.org/ - Fix more checkpatch and coccicheck issues (Jakub) v3: - Non RFC - Collect Signed-off-by from Jesper and Acked-by Ilias. - Move test_bench_page_pool.sh to address nipa complaint. - Remove `static inline` in .c files to address nipa complaint. v2: - Move under tools/selftests (Jakub) - Create ksft for it. - Remove the tasklet logic no longer needed (Jesper + Toke) RFC discussion points: - Desirable to import it? - Can the benchmark be imported as-is for an initial version? Or needs lots of modifications? - Code location. 
I retained the location in Jesper's tree, but a path like net/core/bench/ may make more sense. --- tools/testing/selftests/net/bench/Makefile | 7 + .../selftests/net/bench/page_pool/Makefile | 17 + .../bench/page_pool/bench_page_pool_simple.c | 276 ++++++++++++ .../net/bench/page_pool/time_bench.c | 394 ++++++++++++++++++ .../net/bench/page_pool/time_bench.h | 238 +++++++++++ .../net/bench/test_bench_page_pool.sh | 32 ++ 6 files changed, 964 insertions(+) create mode 100644 tools/testing/selftests/net/bench/Makefile create mode 100644 tools/testing/selftests/net/bench/page_pool/Makefile create mode 100644 tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c create mode 100644 tools/testing/selftests/net/bench/page_pool/time_bench.c create mode 100644 tools/testing/selftests/net/bench/page_pool/time_bench.h create mode 100755 tools/testing/selftests/net/bench/test_bench_page_pool.sh diff --git a/tools/testing/selftests/net/bench/Makefile b/tools/testing/selftests/net/bench/Makefile new file mode 100644 index 000000000000..2546c45e42f7 --- /dev/null +++ b/tools/testing/selftests/net/bench/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_GEN_MODS_DIR := page_pool + +TEST_PROGS += test_bench_page_pool.sh + +include ../../lib.mk diff --git a/tools/testing/selftests/net/bench/page_pool/Makefile b/tools/testing/selftests/net/bench/page_pool/Makefile new file mode 100644 index 000000000000..0549a16ba275 --- /dev/null +++ b/tools/testing/selftests/net/bench/page_pool/Makefile @@ -0,0 +1,17 @@ +BENCH_PAGE_POOL_SIMPLE_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) +KDIR ?= /lib/modules/$(shell uname -r)/build + +ifeq ($(V),1) +Q = +else +Q = @ +endif + +obj-m += bench_page_pool.o +bench_page_pool-y += bench_page_pool_simple.o time_bench.o + +all: + +$(Q)make -C $(KDIR) M=$(BENCH_PAGE_POOL_SIMPLE_TEST_DIR) modules + +clean: + +$(Q)make -C $(KDIR) M=$(BENCH_PAGE_POOL_SIMPLE_TEST_DIR) clean diff --git 
a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c new file mode 100644 index 000000000000..f183d5e30dc6 --- /dev/null +++ b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Benchmark module for page_pool. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/mutex.h> + +#include <linux/version.h> +#include <net/page_pool/helpers.h> + +#include <linux/interrupt.h> +#include <linux/limits.h> + +#include "time_bench.h" + +static int verbose = 1; +#define MY_POOL_SIZE 1024 + +static void _page_pool_put_page(struct page_pool *pool, struct page *page, + bool allow_direct) +{ + page_pool_put_page(pool, page, -1, allow_direct); +} + +/* Makes tests selectable. Useful for perf-record to analyze a single test. + * Hint: Bash shells support writing binary number like: $((2#101010) + * + * # modprobe bench_page_pool_simple run_flags=$((2#100)) + */ +static unsigned long run_flags = 0xFFFFFFFF; +module_param(run_flags, ulong, 0); +MODULE_PARM_DESC(run_flags, "Limit which bench test that runs"); + +/* Count the bit number from the enum */ +enum benchmark_bit { + bit_run_bench_baseline, + bit_run_bench_no_softirq01, + bit_run_bench_no_softirq02, + bit_run_bench_no_softirq03, +}; + +#define bit(b) (1 << (b)) +#define enabled(b) ((run_flags & (bit(b)))) + +/* notice time_bench is limited to U32_MAX nr loops */ +static unsigned long loops = 10000000; +module_param(loops, ulong, 0); +MODULE_PARM_DESC(loops, "Specify loops bench will run"); + +/* Timing at the nanosec level, we need to know the overhead + * introduced by the for loop itself + */ +static int time_bench_for_loop(struct time_bench_record *rec, void *data) +{ + uint64_t loops_cnt = 0; + int i; + + time_bench_start(rec); + /** Loop to measure **/ + for (i = 0; i < rec->loops; i++) { + loops_cnt++; + 
barrier(); /* avoid compiler to optimize this loop */ + } + time_bench_stop(rec, loops_cnt); + return loops_cnt; +} + +static int time_bench_atomic_inc(struct time_bench_record *rec, void *data) +{ + uint64_t loops_cnt = 0; + atomic_t cnt; + int i; + + atomic_set(&cnt, 0); + + time_bench_start(rec); + /** Loop to measure **/ + for (i = 0; i < rec->loops; i++) { + atomic_inc(&cnt); + barrier(); /* avoid compiler to optimize this loop */ + } + loops_cnt = atomic_read(&cnt); + time_bench_stop(rec, loops_cnt); + return loops_cnt; +} + +/* The ptr_ping in page_pool uses a spinlock. We need to know the minimum + * overhead of taking+releasing a spinlock, to know the cycles that can be saved + * by e.g. amortizing this via bulking. + */ +static int time_bench_lock(struct time_bench_record *rec, void *data) +{ + uint64_t loops_cnt = 0; + spinlock_t lock; + int i; + + spin_lock_init(&lock); + + time_bench_start(rec); + /** Loop to measure **/ + for (i = 0; i < rec->loops; i++) { + spin_lock(&lock); + loops_cnt++; + barrier(); /* avoid compiler to optimize this loop */ + spin_unlock(&lock); + } + time_bench_stop(rec, loops_cnt); + return loops_cnt; +} + +/* Helper for filling some page's into ptr_ring */ +static void pp_fill_ptr_ring(struct page_pool *pp, int elems) +{ + /* GFP_ATOMIC needed when under run softirq */ + gfp_t gfp_mask = GFP_ATOMIC; + struct page **array; + int i; + + array = kcalloc(elems, sizeof(struct page *), gfp_mask); + + for (i = 0; i < elems; i++) + array[i] = page_pool_alloc_pages(pp, gfp_mask); + for (i = 0; i < elems; i++) + _page_pool_put_page(pp, array[i], false); + + kfree(array); +} + +enum test_type { type_fast_path, type_ptr_ring, type_page_allocator }; + +/* Depends on compile optimizing this function */ +static int time_bench_page_pool(struct time_bench_record *rec, void *data, + enum test_type type, const char *func) +{ + uint64_t loops_cnt = 0; + gfp_t gfp_mask = GFP_ATOMIC; /* GFP_ATOMIC is not really needed */ + int i, err; + + struct 
page_pool *pp; + struct page *page; + + struct page_pool_params pp_params = { + .order = 0, + .flags = 0, + .pool_size = MY_POOL_SIZE, + .nid = NUMA_NO_NODE, + .dev = NULL, /* Only use for DMA mapping */ + .dma_dir = DMA_BIDIRECTIONAL, + }; + + pp = page_pool_create(&pp_params); + if (IS_ERR(pp)) { + err = PTR_ERR(pp); + pr_warn("%s: Error(%d) creating page_pool\n", func, err); + goto out; + } + pp_fill_ptr_ring(pp, 64); + + if (in_serving_softirq()) + pr_warn("%s(): in_serving_softirq fast-path\n", func); + else + pr_warn("%s(): Cannot use page_pool fast-path\n", func); + + time_bench_start(rec); + /** Loop to measure **/ + for (i = 0; i < rec->loops; i++) { + /* Common fast-path alloc that depend on in_serving_softirq() */ + page = page_pool_alloc_pages(pp, gfp_mask); + if (!page) + break; + loops_cnt++; + barrier(); /* avoid compiler to optimize this loop */ + + /* The benchmarks purpose it to test different return paths. + * Compiler should inline optimize other function calls out + */ + if (type == type_fast_path) { + /* Fast-path recycling e.g. 
XDP_DROP use-case */ + page_pool_recycle_direct(pp, page); + + } else if (type == type_ptr_ring) { + /* Normal return path */ + _page_pool_put_page(pp, page, false); + + } else if (type == type_page_allocator) { + /* Test if not pages are recycled, but instead + * returned back into systems page allocator + */ + get_page(page); /* cause no-recycling */ + _page_pool_put_page(pp, page, false); + put_page(page); + } else { + BUILD_BUG(); + } + } + time_bench_stop(rec, loops_cnt); +out: + page_pool_destroy(pp); + return loops_cnt; +} + +static int time_bench_page_pool01_fast_path(struct time_bench_record *rec, + void *data) +{ + return time_bench_page_pool(rec, data, type_fast_path, __func__); +} + +static int time_bench_page_pool02_ptr_ring(struct time_bench_record *rec, + void *data) +{ + return time_bench_page_pool(rec, data, type_ptr_ring, __func__); +} + +static int time_bench_page_pool03_slow(struct time_bench_record *rec, + void *data) +{ + return time_bench_page_pool(rec, data, type_page_allocator, __func__); +} + +static int run_benchmark_tests(void) +{ + uint32_t nr_loops = loops; + + /* Baseline tests */ + if (enabled(bit_run_bench_baseline)) { + time_bench_loop(nr_loops * 10, 0, "for_loop", NULL, + time_bench_for_loop); + time_bench_loop(nr_loops * 10, 0, "atomic_inc", NULL, + time_bench_atomic_inc); + time_bench_loop(nr_loops, 0, "lock", NULL, time_bench_lock); + } + + /* This test cannot activate correct code path, due to no-softirq ctx */ + if (enabled(bit_run_bench_no_softirq01)) + time_bench_loop(nr_loops, 0, "no-softirq-page_pool01", NULL, + time_bench_page_pool01_fast_path); + if (enabled(bit_run_bench_no_softirq02)) + time_bench_loop(nr_loops, 0, "no-softirq-page_pool02", NULL, + time_bench_page_pool02_ptr_ring); + if (enabled(bit_run_bench_no_softirq03)) + time_bench_loop(nr_loops, 0, "no-softirq-page_pool03", NULL, + time_bench_page_pool03_slow); + + return 0; +} + +static int __init bench_page_pool_simple_module_init(void) +{ + if (verbose) + 
pr_info("Loaded\n"); + + if (loops > U32_MAX) { + pr_err("Module param loops(%lu) exceeded U32_MAX(%u)\n", loops, + U32_MAX); + return -ECHRNG; + } + + run_benchmark_tests(); + + return 0; +} +module_init(bench_page_pool_simple_module_init); + +static void __exit bench_page_pool_simple_module_exit(void) +{ + if (verbose) + pr_info("Unloaded\n"); +} +module_exit(bench_page_pool_simple_module_exit); + +MODULE_DESCRIPTION("Benchmark of page_pool simple cases"); +MODULE_AUTHOR("Jesper Dangaard Brouer <netoptimizer(a)brouer.com>"); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/net/bench/page_pool/time_bench.c b/tools/testing/selftests/net/bench/page_pool/time_bench.c new file mode 100644 index 000000000000..073bb36ec5f2 --- /dev/null +++ b/tools/testing/selftests/net/bench/page_pool/time_bench.c @@ -0,0 +1,394 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Benchmarking code execution time inside the kernel + * + * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/time.h> + +#include <linux/perf_event.h> /* perf_event_create_kernel_counter() */ + +/* For concurrency testing */ +#include <linux/completion.h> +#include <linux/sched.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> + +#include "time_bench.h" + +static int verbose = 1; + +/** TSC (Time-Stamp Counter) based ** + * See: linux/time_bench.h + * tsc_start_clock() and tsc_stop_clock() + */ + +/** Wall-clock based ** + */ + +/** PMU (Performance Monitor Unit) based ** + */ +#define PERF_FORMAT \ + (PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | \ + PERF_FORMAT_TOTAL_TIME_RUNNING) + +struct raw_perf_event { + uint64_t config; /* event */ + uint64_t config1; /* umask */ + struct perf_event *save; + char *desc; +}; + +/* if HT is enable a maximum of 4 events (5 if one is instructions + * retired can be specified, if HT is disabled a maximum of 8 (9 if + * one is 
instructions retired) can be specified. + * + * From Table 19-1. Architectural Performance Events + * Architectures Software Developer’s Manual Volume 3: System Programming + * Guide + */ +struct raw_perf_event perf_events[] = { + { 0x3c, 0x00, NULL, "Unhalted CPU Cycles" }, + { 0xc0, 0x00, NULL, "Instruction Retired" } +}; + +#define NUM_EVTS (ARRAY_SIZE(perf_events)) + +/* WARNING: PMU config is currently broken! + */ +bool time_bench_PMU_config(bool enable) +{ + int i; + struct perf_event_attr perf_conf; + struct perf_event *perf_event; + int cpu; + + preempt_disable(); + cpu = smp_processor_id(); + pr_info("DEBUG: cpu:%d\n", cpu); + preempt_enable(); + + memset(&perf_conf, 0, sizeof(struct perf_event_attr)); + perf_conf.type = PERF_TYPE_RAW; + perf_conf.size = sizeof(struct perf_event_attr); + perf_conf.read_format = PERF_FORMAT; + perf_conf.pinned = 1; + perf_conf.exclude_user = 1; /* No userspace events */ + perf_conf.exclude_kernel = 0; /* Only kernel events */ + + for (i = 0; i < NUM_EVTS; i++) { + perf_conf.disabled = enable; + //perf_conf.disabled = (i == 0) ? 
1 : 0; + perf_conf.config = perf_events[i].config; + perf_conf.config1 = perf_events[i].config1; + if (verbose) + pr_info("%s() enable PMU counter: %s\n", + __func__, perf_events[i].desc); + perf_event = perf_event_create_kernel_counter(&perf_conf, cpu, + NULL /* task */, + NULL /* overflow_handler*/, + NULL /* context */); + if (perf_event) { + perf_events[i].save = perf_event; + pr_info("%s():DEBUG perf_event success\n", __func__); + + perf_event_enable(perf_event); + } else { + pr_info("%s():DEBUG perf_event is NULL\n", __func__); + } + } + + return true; +} + +/** Generic functions ** + */ + +/* Calculate stats, store results in record */ +bool time_bench_calc_stats(struct time_bench_record *rec) +{ +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ + uint64_t ns_per_call_tmp_rem = 0; + uint32_t ns_per_call_remainder = 0; + uint64_t pmc_ipc_tmp_rem = 0; + uint32_t pmc_ipc_remainder = 0; + uint32_t pmc_ipc_div = 0; + uint32_t invoked_cnt_precision = 0; + uint32_t invoked_cnt = 0; /* 32-bit due to div_u64_rem() */ + + if (rec->flags & TIME_BENCH_LOOP) { + if (rec->invoked_cnt < 1000) { + pr_err("ERR: need more(>1000) loops(%llu) for timing\n", + rec->invoked_cnt); + return false; + } + if (rec->invoked_cnt > ((1ULL << 32) - 1)) { + /* div_u64_rem() can only support div with 32bit*/ + pr_err("ERR: Invoke cnt(%llu) too big overflow 32bit\n", + rec->invoked_cnt); + return false; + } + invoked_cnt = (uint32_t)rec->invoked_cnt; + } + + /* TSC (Time-Stamp Counter) records */ + if (rec->flags & TIME_BENCH_TSC) { + rec->tsc_interval = rec->tsc_stop - rec->tsc_start; + if (rec->tsc_interval == 0) { + pr_err("ABORT: timing took ZERO TSC time\n"); + return false; + } + /* Calculate stats */ + if (rec->flags & TIME_BENCH_LOOP) + rec->tsc_cycles = rec->tsc_interval / invoked_cnt; + else + rec->tsc_cycles = rec->tsc_interval; + } + + /* Wall-clock time calc */ + if (rec->flags & TIME_BENCH_WALLCLOCK) { + rec->time_start = rec->ts_start.tv_nsec + + (NANOSEC_PER_SEC * 
rec->ts_start.tv_sec); + rec->time_stop = rec->ts_stop.tv_nsec + + (NANOSEC_PER_SEC * rec->ts_stop.tv_sec); + rec->time_interval = rec->time_stop - rec->time_start; + if (rec->time_interval == 0) { + pr_err("ABORT: timing took ZERO wallclock time\n"); + return false; + } + /* Calculate stats */ + /*** Division in kernel it tricky ***/ + /* Orig: time_sec = (time_interval / NANOSEC_PER_SEC); */ + /* remainder only correct because NANOSEC_PER_SEC is 10^9 */ + rec->time_sec = div_u64_rem(rec->time_interval, NANOSEC_PER_SEC, + &rec->time_sec_remainder); + //TODO: use existing struct timespec records instead of div? + + if (rec->flags & TIME_BENCH_LOOP) { + /*** Division in kernel it tricky ***/ + /* Orig: ns = ((double)time_interval / invoked_cnt); */ + /* First get quotient */ + rec->ns_per_call_quotient = + div_u64_rem(rec->time_interval, invoked_cnt, + &ns_per_call_remainder); + /* Now get decimals .xxx precision (incorrect roundup)*/ + ns_per_call_tmp_rem = ns_per_call_remainder; + invoked_cnt_precision = invoked_cnt / 1000; + if (invoked_cnt_precision > 0) { + rec->ns_per_call_decimal = + div_u64_rem(ns_per_call_tmp_rem, + invoked_cnt_precision, + &ns_per_call_remainder); + } + } + } + + /* Performance Monitor Unit (PMU) counters */ + if (rec->flags & TIME_BENCH_PMU) { + //FIXME: Overflow handling??? + rec->pmc_inst = rec->pmc_inst_stop - rec->pmc_inst_start; + rec->pmc_clk = rec->pmc_clk_stop - rec->pmc_clk_start; + + /* Calc Instruction Per Cycle (IPC) */ + /* First get quotient */ + rec->pmc_ipc_quotient = div_u64_rem(rec->pmc_inst, rec->pmc_clk, + &pmc_ipc_remainder); + /* Now get decimals .xxx precision (incorrect roundup)*/ + pmc_ipc_tmp_rem = pmc_ipc_remainder; + pmc_ipc_div = rec->pmc_clk / 1000; + if (pmc_ipc_div > 0) { + rec->pmc_ipc_decimal = div_u64_rem(pmc_ipc_tmp_rem, + pmc_ipc_div, + &pmc_ipc_remainder); + } + } + + return true; +} + +/* Generic function for invoking a loop function and calculating + * execution time stats. 
The function being called/timed is assumed + * to perform a tight loop, and update the timing record struct. + */ +bool time_bench_loop(uint32_t loops, int step, char *txt, void *data, + int (*func)(struct time_bench_record *record, void *data)) +{ + struct time_bench_record rec; + + /* Setup record */ + memset(&rec, 0, sizeof(rec)); /* zero func might not update all */ + rec.version_abi = 1; + rec.loops = loops; + rec.step = step; + rec.flags = (TIME_BENCH_LOOP | TIME_BENCH_TSC | TIME_BENCH_WALLCLOCK); + + /*** Loop function being timed ***/ + if (!func(&rec, data)) { + pr_err("ABORT: function being timed failed\n"); + return false; + } + + if (rec.invoked_cnt < loops) + pr_warn("WARNING: Invoke count(%llu) smaller than loops(%d)\n", + rec.invoked_cnt, loops); + + /* Calculate stats */ + time_bench_calc_stats(&rec); + + pr_info("Type:%s Per elem: %llu cycles(tsc) %llu.%03llu ns (step:%d) - (measurement period time:%llu.%09u sec time_interval:%llu) - (invoke count:%llu tsc_interval:%llu)\n", + txt, rec.tsc_cycles, rec.ns_per_call_quotient, + rec.ns_per_call_decimal, rec.step, rec.time_sec, + rec.time_sec_remainder, rec.time_interval, rec.invoked_cnt, + rec.tsc_interval); + if (rec.flags & TIME_BENCH_PMU) + pr_info("Type:%s PMU inst/clock%llu/%llu = %llu.%03llu IPC (inst per cycle)\n", + txt, rec.pmc_inst, rec.pmc_clk, rec.pmc_ipc_quotient, + rec.pmc_ipc_decimal); + return true; +} + +/* Function getting invoked by kthread */ +static int invoke_test_on_cpu_func(void *private) +{ + struct time_bench_cpu *cpu = private; + struct time_bench_sync *sync = cpu->sync; + cpumask_t newmask = CPU_MASK_NONE; + void *data = cpu->data; + + /* Restrict CPU */ + cpumask_set_cpu(cpu->rec.cpu, &newmask); + set_cpus_allowed_ptr(current, &newmask); + + /* Synchronize start of concurrency test */ + atomic_inc(&sync->nr_tests_running); + wait_for_completion(&sync->start_event); + + /* Start benchmark function */ + if (!cpu->bench_func(&cpu->rec, data)) { + pr_err("ERROR: function being 
timed failed on CPU:%d(%d)\n", + cpu->rec.cpu, smp_processor_id()); + } else { + if (verbose) + pr_info("SUCCESS: ran on CPU:%d(%d)\n", cpu->rec.cpu, + smp_processor_id()); + } + cpu->did_bench_run = true; + + /* End test */ + atomic_dec(&sync->nr_tests_running); + /* Wait for kthread_stop() telling us to stop */ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +void time_bench_print_stats_cpumask(const char *desc, + struct time_bench_cpu *cpu_tasks, + const struct cpumask *mask) +{ + uint64_t average = 0; + int cpu; + int step = 0; + struct sum { + uint64_t tsc_cycles; + int records; + } sum = { 0 }; + + /* Get stats */ + for_each_cpu(cpu, mask) { + struct time_bench_cpu *c = &cpu_tasks[cpu]; + struct time_bench_record *rec = &c->rec; + + /* Calculate stats */ + time_bench_calc_stats(rec); + + pr_info("Type:%s CPU(%d) %llu cycles(tsc) %llu.%03llu ns (step:%d) - (measurement period time:%llu.%09u sec time_interval:%llu) - (invoke count:%llu tsc_interval:%llu)\n", + desc, cpu, rec->tsc_cycles, rec->ns_per_call_quotient, + rec->ns_per_call_decimal, rec->step, rec->time_sec, + rec->time_sec_remainder, rec->time_interval, + rec->invoked_cnt, rec->tsc_interval); + + /* Collect average */ + sum.records++; + sum.tsc_cycles += rec->tsc_cycles; + step = rec->step; + } + + if (sum.records) /* avoid div-by-zero */ + average = sum.tsc_cycles / sum.records; + pr_info("Sum Type:%s Average: %llu cycles(tsc) CPUs:%d step:%d\n", desc, + average, sum.records, step); +} + +void time_bench_run_concurrent(uint32_t loops, int step, void *data, + const struct cpumask *mask, /* Support masking outsome CPUs*/ + struct time_bench_sync *sync, + struct time_bench_cpu *cpu_tasks, + int (*func)(struct time_bench_record *record, void *data)) +{ + int cpu, running = 0; + + if (verbose) // DEBUG + pr_warn("%s() Started on CPU:%d\n", __func__, + smp_processor_id()); + + /* Reset sync conditions */ + 
atomic_set(&sync->nr_tests_running, 0); + init_completion(&sync->start_event); + + /* Spawn off jobs on all CPUs */ + for_each_cpu(cpu, mask) { + struct time_bench_cpu *c = &cpu_tasks[cpu]; + + running++; + c->sync = sync; /* Send sync variable along */ + c->data = data; /* Send opaque along */ + + /* Init benchmark record */ + memset(&c->rec, 0, sizeof(struct time_bench_record)); + c->rec.version_abi = 1; + c->rec.loops = loops; + c->rec.step = step; + c->rec.flags = (TIME_BENCH_LOOP | TIME_BENCH_TSC | + TIME_BENCH_WALLCLOCK); + c->rec.cpu = cpu; + c->bench_func = func; + c->task = kthread_run(invoke_test_on_cpu_func, c, + "time_bench%d", cpu); + if (IS_ERR(c->task)) { + pr_err("%s(): Failed to start test func\n", __func__); + return; /* Argh, what about cleanup?! */ + } + } + + /* Wait until all processes are running */ + while (atomic_read(&sync->nr_tests_running) < running) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(10); + } + /* Kick off all CPU concurrently on completion event */ + complete_all(&sync->start_event); + + /* Wait for CPUs to finish */ + while (atomic_read(&sync->nr_tests_running)) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(10); + } + + /* Stop the kthreads */ + for_each_cpu(cpu, mask) { + struct time_bench_cpu *c = &cpu_tasks[cpu]; + + kthread_stop(c->task); + } + + if (verbose) // DEBUG - happens often, finish on another CPU + pr_warn("%s() Finished on CPU:%d\n", __func__, + smp_processor_id()); +} diff --git a/tools/testing/selftests/net/bench/page_pool/time_bench.h b/tools/testing/selftests/net/bench/page_pool/time_bench.h new file mode 100644 index 000000000000..e113fcf341dc --- /dev/null +++ b/tools/testing/selftests/net/bench/page_pool/time_bench.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Benchmarking code execution time inside the kernel + * + * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer + * for licensing details see kernel-base/COPYING + */ +#ifndef 
_LINUX_TIME_BENCH_H +#define _LINUX_TIME_BENCH_H + +/* Main structure used for recording a benchmark run */ +struct time_bench_record { + uint32_t version_abi; + uint32_t loops; /* Requested loop invocations */ + uint32_t step; /* option for e.g. bulk invocations */ + + uint32_t flags; /* Measurements types enabled */ +#define TIME_BENCH_LOOP BIT(0) +#define TIME_BENCH_TSC BIT(1) +#define TIME_BENCH_WALLCLOCK BIT(2) +#define TIME_BENCH_PMU BIT(3) + + uint32_t cpu; /* Used when embedded in time_bench_cpu */ + + /* Records */ + uint64_t invoked_cnt; /* Returned actual invocations */ + uint64_t tsc_start; + uint64_t tsc_stop; + struct timespec64 ts_start; + struct timespec64 ts_stop; + /* PMU counters for instruction and cycles + * instructions counter including pipelined instructions + */ + uint64_t pmc_inst_start; + uint64_t pmc_inst_stop; + /* CPU unhalted clock counter */ + uint64_t pmc_clk_start; + uint64_t pmc_clk_stop; + + /* Result records */ + uint64_t tsc_interval; + uint64_t time_start, time_stop, time_interval; /* in nanosec */ + uint64_t pmc_inst, pmc_clk; + + /* Derived result records */ + uint64_t tsc_cycles; // +decimal? + uint64_t ns_per_call_quotient, ns_per_call_decimal; + uint64_t time_sec; + uint32_t time_sec_remainder; + uint64_t pmc_ipc_quotient, pmc_ipc_decimal; /* inst per cycle */ +}; + +/* For synchronizing parallel CPUs to run concurrently */ +struct time_bench_sync { + atomic_t nr_tests_running; + struct completion start_event; +}; + +/* Keep track of CPUs executing our bench function. 
+ * + * Embed a time_bench_record for storing info per cpu + */ +struct time_bench_cpu { + struct time_bench_record rec; + struct time_bench_sync *sync; /* back ptr */ + struct task_struct *task; + /* "data" opaque could have been placed in time_bench_sync, + * but to avoid any false sharing, place it per CPU + */ + void *data; + /* Support masking outsome CPUs, mark if it ran */ + bool did_bench_run; + /* int cpu; // note CPU stored in time_bench_record */ + int (*bench_func)(struct time_bench_record *record, void *data); +}; + +/* + * Below TSC assembler code is not compatible with other archs, and + * can also fail on guests if cpu-flags are not correct. + * + * The way TSC reading is used, many iterations, does not require as + * high accuracy as described below (in Intel Doc #324264). + * + * Considering changing to use get_cycles() (#include <asm/timex.h>). + */ + +/** TSC (Time-Stamp Counter) based ** + * Recommend reading, to understand details of reading TSC accurately: + * Intel Doc #324264, "How to Benchmark Code Execution Times on Intel" + * + * Consider getting exclusive ownership of CPU by using: + * unsigned long flags; + * preempt_disable(); + * raw_local_irq_save(flags); + * _your_code_ + * raw_local_irq_restore(flags); + * preempt_enable(); + * + * Clobbered registers: "%rax", "%rbx", "%rcx", "%rdx" + * RDTSC only change "%rax" and "%rdx" but + * CPUID clears the high 32-bits of all (rax/rbx/rcx/rdx) + */ +static __always_inline uint64_t tsc_start_clock(void) +{ + /* See: Intel Doc #324264 */ + unsigned int hi, lo; + + asm volatile("CPUID\n\t" + "RDTSC\n\t" + "mov %%edx, %0\n\t" + "mov %%eax, %1\n\t" + : "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx"); + //FIXME: on 32bit use clobbered %eax + %edx + return ((uint64_t)lo) | (((uint64_t)hi) << 32); +} + +static __always_inline uint64_t tsc_stop_clock(void) +{ + /* See: Intel Doc #324264 */ + unsigned int hi, lo; + + asm volatile("RDTSCP\n\t" + "mov %%edx, %0\n\t" + "mov %%eax, %1\n\t" + 
"CPUID\n\t" + : "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx"); + return ((uint64_t)lo) | (((uint64_t)hi) << 32); +} + +/** Wall-clock based ** + * + * use: getnstimeofday() + * getnstimeofday(&rec->ts_start); + * getnstimeofday(&rec->ts_stop); + * + * API changed see: Documentation/core-api/timekeeping.rst + * https://www.kernel.org/doc/html/latest/core-api/timekeeping.html#c.getnstim… + * + * We should instead use: ktime_get_real_ts64() is a direct + * replacement, but consider using monotonic time (ktime_get_ts64()) + * and/or a ktime_t based interface (ktime_get()/ktime_get_real()). + */ + +/** PMU (Performance Monitor Unit) based ** + * + * Needed for calculating: Instructions Per Cycle (IPC) + * - The IPC number tell how efficient the CPU pipelining were + */ +//lookup: perf_event_create_kernel_counter() + +bool time_bench_PMU_config(bool enable); + +/* Raw reading via rdpmc() using fixed counters + * + * From: https://github.com/andikleen/simple-pmu + */ +enum { + FIXED_SELECT = (1U << 30), /* == 0x40000000 */ + FIXED_INST_RETIRED_ANY = 0, + FIXED_CPU_CLK_UNHALTED_CORE = 1, + FIXED_CPU_CLK_UNHALTED_REF = 2, +}; + +static __always_inline unsigned int long long p_rdpmc(unsigned int in) +{ + unsigned int d, a; + + asm volatile("rdpmc" : "=d"(d), "=a"(a) : "c"(in) : "memory"); + return ((unsigned long long)d << 32) | a; +} + +/* These PMU counter needs to be enabled, but I don't have the + * configure code implemented. 
My current hack is running: + * sudo perf stat -e cycles:k -e instructions:k insmod lib/ring_queue_test.ko + */ +/* Reading all pipelined instruction */ +static __always_inline unsigned long long pmc_inst(void) +{ + return p_rdpmc(FIXED_SELECT | FIXED_INST_RETIRED_ANY); +} + +/* Reading CPU clock cycles */ +static __always_inline unsigned long long pmc_clk(void) +{ + return p_rdpmc(FIXED_SELECT | FIXED_CPU_CLK_UNHALTED_CORE); +} + +/* Raw reading via MSR rdmsr() is likely wrong + * FIXME: How can I know which raw MSR registers are conf for what? + */ +#define MSR_IA32_PCM0 0x400000C1 /* PERFCTR0 */ +#define MSR_IA32_PCM1 0x400000C2 /* PERFCTR1 */ +#define MSR_IA32_PCM2 0x400000C3 +static inline uint64_t msr_inst(unsigned long long *msr_result) +{ + return rdmsrq_safe(MSR_IA32_PCM0, msr_result); +} + +/** Generic functions ** + */ +bool time_bench_loop(uint32_t loops, int step, char *txt, void *data, + int (*func)(struct time_bench_record *rec, void *data)); +bool time_bench_calc_stats(struct time_bench_record *rec); + +void time_bench_run_concurrent(uint32_t loops, int step, void *data, + const struct cpumask *mask, /* Support masking outsome CPUs*/ + struct time_bench_sync *sync, struct time_bench_cpu *cpu_tasks, + int (*func)(struct time_bench_record *record, void *data)); +void time_bench_print_stats_cpumask(const char *desc, + struct time_bench_cpu *cpu_tasks, + const struct cpumask *mask); + +//FIXME: use rec->flags to select measurement, should be MACRO +static __always_inline void time_bench_start(struct time_bench_record *rec) +{ + //getnstimeofday(&rec->ts_start); + ktime_get_real_ts64(&rec->ts_start); + if (rec->flags & TIME_BENCH_PMU) { + rec->pmc_inst_start = pmc_inst(); + rec->pmc_clk_start = pmc_clk(); + } + rec->tsc_start = tsc_start_clock(); +} + +static __always_inline void time_bench_stop(struct time_bench_record *rec, + uint64_t invoked_cnt) +{ + rec->tsc_stop = tsc_stop_clock(); + if (rec->flags & TIME_BENCH_PMU) { + rec->pmc_inst_stop = 
pmc_inst(); + rec->pmc_clk_stop = pmc_clk(); + } + //getnstimeofday(&rec->ts_stop); + ktime_get_real_ts64(&rec->ts_stop); + rec->invoked_cnt = invoked_cnt; +} + +#endif /* _LINUX_TIME_BENCH_H */ diff --git a/tools/testing/selftests/net/bench/test_bench_page_pool.sh b/tools/testing/selftests/net/bench/test_bench_page_pool.sh new file mode 100755 index 000000000000..7b8b18cfedce --- /dev/null +++ b/tools/testing/selftests/net/bench/test_bench_page_pool.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# + +set -e + +DRIVER="./page_pool/bench_page_pool.ko" +result="" + +function run_test() +{ + rmmod "bench_page_pool.ko" || true + insmod $DRIVER > /dev/null 2>&1 + result=$(dmesg | tail -10) + echo "$result" + + echo + echo "Fast path results:" + echo "${result}" | grep -o -E "no-softirq-page_pool01 Per elem: ([0-9]+) cycles$tsc$ ([0-9]+\.[0-9]+) ns" + + echo + echo "ptr_ring results:" + echo "${result}" | grep -o -E "no-softirq-page_pool02 Per elem: ([0-9]+) cycles$tsc$ ([0-9]+\.[0-9]+) ns" + + echo + echo "slow path results:" + echo "${result}" | grep -o -E "no-softirq-page_pool03 Per elem: ([0-9]+) cycles$tsc$ ([0-9]+\.[0-9]+) ns" +} + +run_test + +exit 0 base-commit: afc783fa0aab9cc093fbb04871bfda406480cf8d -- 2.50.0.rc2.701.gf1e915cc24-goog

3 months, 3 weeks

4
7
0 0

[PATCH rc v2 0/4] Fix iommufd selftest FAIL and warnings with v6.16

by Nicolin Chen

A few selftest harness changes are being merged to v6.16, which has exposed some bugs and vulnerabilities in the iommufd selftest code. Fix them properly. Note that the patch fixing the build warnings at mfd is not ideal, as it has possibly hit some corner case in gcc: https://lore.kernel.org/all/aEi8DV+ReF3v3Rlf@nvidia.com/ This is on github: https://github.com/nicolinc/iommufd/commits/iommufd_selftest_fixes-v6.16 Changelog: v2 * Add "Reviewed-by" from Jason * Only use kfree() in the teardown() * Add an mmap_buffer_size for readability v1 https://lore.kernel.org/all/cover.1750049883.git.nicolinc@nvidia.com/ Thanks Nicolin Nicolin Chen (4): iommufd/selftest: Fix iommufd_dirty_tracking with large hugepage sizes iommufd/selftest: Add missing close(mfd) in memfd_mmap() iommufd/selftest: Add asserts testing global mfd iommufd/selftest: Fix build warnings due to uninitialized mfd tools/testing/selftests/iommu/iommufd_utils.h | 9 ++++- tools/testing/selftests/iommu/iommufd.c | 40 ++++++++++++++----- 2 files changed, 36 insertions(+), 13 deletions(-) -- 2.43.0

3 months, 3 weeks

2
5
0 0

[PATCH 0/2] rust: replace `allow(...)` with `expect(...)`

by Onur Özkan

Replaces various `#[allow(...)]` with `#[expect(...)]` as suggested in the kernel coding guidelines: [link] [link]: https://docs.kernel.org/rust/coding-guidelines.html#lints After switching to `#[expect(...)]`, I found some dead linting rules that are no longer needed which are removed in the second patch. Onur Özkan (1): replace `#[allow(...)]` with `#[expect(...)]` onur-ozkan (1): rust: drop unnecessary lints caught by `#[expect(...)]` drivers/gpu/nova-core/regs.rs | 2 +- rust/compiler_builtins.rs | 2 +- rust/kernel/alloc/allocator_test.rs | 2 +- rust/kernel/cpufreq.rs | 1 - rust/kernel/devres.rs | 2 +- rust/kernel/driver.rs | 2 +- rust/kernel/drm/ioctl.rs | 8 ++++---- rust/kernel/error.rs | 3 +-- rust/kernel/init.rs | 6 +++--- rust/kernel/kunit.rs | 2 +- rust/kernel/opp.rs | 4 ++-- rust/kernel/types.rs | 2 +- rust/macros/helpers.rs | 2 +- 13 files changed, 18 insertions(+), 20 deletions(-) -- 2.50.0

3 months, 3 weeks

3
4
0 0

[PATCH 0/6] selftests/mm: Fix false positives and skip unsupported tests

by Aboorva Devarajan

This patch series fixes some of the false positives in generic mm selftests and skips tests that cannot run correctly due to missing features or system limitations. Please let us know if you have any feedback. Thanks, Aboorva Aboorva Devarajan (2): selftests/mm: Fix child process exit codes in KSM tests selftests/mm: Mark thuge-gen as skipped if shmmax is too small or no 1G pages Donet Tom (4): mm/selftests: Fix virtual_address_range test issues. selftest/mm: Fix ksm_funtional_test failures selftests/mm : fix test_prctl_fork_exec failure mm/selftests: Fix split_huge_page_test failure on systems with 64KB page size .../selftests/mm/ksm_functional_tests.c | 24 +++++++++++++------ .../selftests/mm/split_huge_page_test.c | 23 ++++++++++++++---- tools/testing/selftests/mm/thuge-gen.c | 11 +++++---- .../selftests/mm/virtual_address_range.c | 14 +++-------- 4 files changed, 45 insertions(+), 27 deletions(-) -- 2.43.5

3 months, 3 weeks

6
43
0 0

[PATCH] selftests/mm: remove duplicate .gitignore entries

by Moon Hee Lee

Remove redundant entries in .gitignore confirmed by: $ sort tools/testing/selftests/mm/.gitignore | uniq -d hugetlb_dio pkey_sighandler_tests_32 pkey_sighandler_tests_64 These entries were originally added by [1], and later duplicated by [2]. [1] https://lore.kernel.org/all/20240924185911.117937-1-lorenzo.stoakes@oracle.… [2] https://lore.kernel.org/all/20241125064036.413536-1-lizhijian@fujitsu.com/ Signed-off-by: Moon Hee Lee <moonhee.lee.ca(a)gmail.com> --- tools/testing/selftests/mm/.gitignore | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 824266982aa3..f2dafa0b700b 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -38,9 +38,6 @@ map_fixed_noreplace write_to_hugetlbfs hmm-tests memfd_secret -hugetlb_dio -pkey_sighandler_tests_32 -pkey_sighandler_tests_64 soft-dirty split_huge_page_test ksm_tests -- 2.43.0

3 months, 3 weeks

2
1
0 0

[PATCH 0/3] tools/nolibc: add support for SuperH

by Thomas Weißschuh

Add support for SuperH/"sh" to nolibc. Only sh4 is tested for now. This is only tested on QEMU so far. Additional testing would be very welcome. Signed-off-by: Thomas Weißschuh <linux(a)weissschuh.net> --- Thomas Weißschuh (3): selftests/nolibc: fix EXTRACONFIG variables ordering selftests/nolibc: use file driver for QEMU serial tools/nolibc: add support for SuperH tools/include/nolibc/arch-sh.h | 162 ++++++++++++++++++++++++++++ tools/include/nolibc/arch.h | 2 + tools/testing/selftests/nolibc/Makefile | 15 ++- tools/testing/selftests/nolibc/run-tests.sh | 3 +- 4 files changed, 177 insertions(+), 5 deletions(-) --- base-commit: 6275a61db2f0586b8a5d651dfc7b4aacf9d0b2d6 change-id: 20250528-nolibc-sh-8b4e3bb8efcb Best regards, -- Thomas Weißschuh <linux(a)weissschuh.net>

3 months, 3 weeks

4
9
0 0

[PATCH v5 0/7] use per-vma locks for /proc/pid/maps reads and PROCMAP_QUERY

by Suren Baghdasaryan

Reading /proc/pid/maps requires read-locking mmap_lock which prevents any other task from concurrently modifying the address space. This guarantees coherent reporting of virtual address ranges, however it can block important updates from happening. Oftentimes /proc/pid/maps readers are low priority monitoring tasks and them blocking high priority tasks results in priority inversion. Locking the entire address space is required to present a fully coherent picture of the address space, however even the current implementation does not strictly guarantee that by outputting vmas in page-size chunks and dropping mmap_lock in between each chunk. Address space modifications are possible while mmap_lock is dropped and userspace reading the content is expected to deal with possible concurrent address space modifications. Considering these relaxed rules, holding mmap_lock is not strictly needed as long as we can guarantee that a concurrently modified vma is reported either in its original form or after it was modified. This patchset switches from holding mmap_lock while reading /proc/pid/maps to taking per-vma locks as we walk the vma tree. This reduces the contention with tasks modifying the address space because they would have to contend for the same vma as opposed to the entire address space. Same is done for PROCMAP_QUERY ioctl which locks only the vma that fell into the requested range instead of the entire address space. Previous version of this patchset [1] tried to perform /proc/pid/maps reading under RCU, however its implementation is quite complex and the results are worse than the new version because it still relied on mmap_lock speculation which retries if any part of the address space gets modified. The new implementation is both simpler and results in less contention. Note that a similar approach would not work for /proc/pid/smaps reading as it also walks the page table and that's not RCU-safe.
Paul McKenney designed a test [2] to measure mmap/munmap latencies while concurrently reading /proc/pid/maps. The test has a pair of processes scanning /proc/PID/maps, and another process unmapping and remapping 4K pages from a 128MB range of anonymous memory. At the end of each 10 second run, the latency of each mmap() or munmap() operation is measured, and for each run the maximum and mean latency are printed. The map/unmap process is started first, its PID is passed to the scanners, and then the map/unmap process waits until both scanners are running before starting its timed test. The scanners keep scanning until the specified /proc/PID/maps file disappears. This test registered close to 10x improvement in update latencies: Before the change: ./run-proc-vs-map.sh --nsamples 100 --rawdata -- --busyduration 2 0.011 0.008 0.455 0.011 0.008 0.472 0.011 0.008 0.535 0.011 0.009 0.545 ... 0.011 0.014 2.875 0.011 0.014 2.913 0.011 0.014 3.007 0.011 0.015 3.018 After the change: ./run-proc-vs-map.sh --nsamples 100 --rawdata -- --busyduration 2 0.006 0.005 0.036 0.006 0.005 0.039 0.006 0.005 0.039 0.006 0.005 0.039 ... 0.006 0.006 0.403 0.006 0.006 0.474 0.006 0.006 0.479 0.006 0.006 0.498 The patchset also adds a number of tests to check for /proc/pid/maps data coherency. They are designed to detect any unexpected data tearing while performing some common address space modifications (vma split, resize and remap). Even before these changes, reading /proc/pid/maps might have inconsistent data because the file is read page-by-page with mmap_lock being dropped between the pages. An example of user-visible inconsistency can be that the same vma is printed twice: once before it was modified and then after the modifications. For example, if a vma was extended, it might be found and reported twice. What is not expected is to see a gap where there should have been a vma both before and after modification.
This patchset increases the chances of such tearing, therefore it's even more important now to test for unexpected inconsistencies. In [3] Lorenzo identified the following possible vma merging/splitting scenarios: Merges with changes to existing vmas: 1. Merge both - mapping a vma over another one and between two vmas which can be merged after this replacement; 2. Merge left full - mapping a vma at the end of an existing one and completely over its right neighbor; 3. Merge left partial - mapping a vma at the end of an existing one and partially over its right neighbor; 4. Merge right full - mapping a vma before the start of an existing one and completely over its left neighbor; 5. Merge right partial - mapping a vma before the start of an existing one and partially over its left neighbor; Merges without changes to existing vmas: 6. Merge both - mapping a vma into a gap between two vmas which can be merged after the insertion; 7. Merge left - mapping a vma at the end of an existing one; 8. Merge right - mapping a vma before the start of an existing one; Splits 9. Split with new vma at the lower address; 10.
Split with new vma at the higher address; If such merges or splits happen concurrently with the /proc/maps reading we might report a vma twice, once before the modification and once after it is modified: Case 1 might report overwritten and previous vma along with the final merged vma; Case 2 might report previous and the final merged vma; Case 3 might cause us to retry once we detect the temporary gap caused by shrinking of the right neighbor; Case 4 might report overwritten and the final merged vma; Case 5 might cause us to retry once we detect the temporary gap caused by shrinking of the left neighbor; Case 6 might report previous vma and the gap along with the final merged vma; Case 7 might report previous and the final merged vma; Case 8 might report the original gap and the final merged vma covering the gap; Case 9 might cause us to retry once we detect the temporary gap caused by shrinking of the original vma at the vma start; Case 10 might cause us to retry once we detect the temporary gap caused by shrinking of the original vma at the vma end; In all these cases the retry mechanism prevents us from reporting possible temporary gaps.
Changes from v4 [4]: - refactored trylock_vma() and other locking parts into mmap_lock.c, per Lorenzo - renamed {lock|unlock}_content() into {lock|unlock}_vma_range(), per Lorenzo - added clarifying comments for sentinels, per Lorenzo - introduced is_sentinel_pos() helper function - fixed position reset logic when last_addr is a sentinel, per Lorenzo - added Acked-by to the last patch, per Andrii Nakryiko [1] https://lore.kernel.org/all/20250418174959.1431962-1-surenb@google.com/ [2] https://github.com/paulmckrcu/proc-mmap_sem-test [3] https://lore.kernel.org/all/e1863f40-39ab-4e5b-984a-c48765ffde1c@lucifer.lo… [4] https://lore.kernel.org/all/20250604231151.799834-1-surenb@google.com/ Suren Baghdasaryan (7): selftests/proc: add /proc/pid/maps tearing from vma split test selftests/proc: extend /proc/pid/maps tearing test to include vma resizing selftests/proc: extend /proc/pid/maps tearing test to include vma remapping selftests/proc: test PROCMAP_QUERY ioctl while vma is concurrently modified selftests/proc: add verbose more for tests to facilitate debugging mm/maps: read proc/pid/maps under per-vma lock mm/maps: execute PROCMAP_QUERY ioctl under per-vma locks fs/proc/internal.h | 5 + fs/proc/task_mmu.c | 179 ++++- include/linux/mmap_lock.h | 11 + mm/mmap_lock.c | 88 +++ tools/testing/selftests/proc/proc-pid-vm.c | 793 ++++++++++++++++++++- 5 files changed, 1053 insertions(+), 23 deletions(-) base-commit: 0b2a863368fb0cf674b40925c55dc8898c5a33af -- 2.50.0.714.g196bf9f422-goog

3 months, 3 weeks

2
9
0 0

Re: [PATCH 1/6] mm/selftests: Fix virtual_address_range test issues.

by Donet Tom

eOn Tue, Jun 24, 2025 at 11:45:09AM +0530, Dev Jain wrote: > > On 23/06/25 11:02 pm, Donet Tom wrote: > > On Mon, Jun 23, 2025 at 10:23:02AM +0530, Dev Jain wrote: > > > On 21/06/25 11:25 pm, Donet Tom wrote: > > > > On Fri, Jun 20, 2025 at 08:15:25PM +0530, Dev Jain wrote: > > > > > On 19/06/25 1:53 pm, Donet Tom wrote: > > > > > > On Wed, Jun 18, 2025 at 08:13:54PM +0530, Dev Jain wrote: > > > > > > > On 18/06/25 8:05 pm, Lorenzo Stoakes wrote: > > > > > > > > On Wed, Jun 18, 2025 at 07:47:18PM +0530, Dev Jain wrote: > > > > > > > > > On 18/06/25 7:37 pm, Lorenzo Stoakes wrote: > > > > > > > > > > On Wed, Jun 18, 2025 at 07:28:16PM +0530, Dev Jain wrote: > > > > > > > > > > > On 18/06/25 5:27 pm, Lorenzo Stoakes wrote: > > > > > > > > > > > > On Wed, Jun 18, 2025 at 05:15:50PM +0530, Dev Jain wrote: > > > > > > > > > > > > Are you accounting for sys.max_map_count? If not, then you'll be hitting that > > > > > > > > > > > > first. > > > > > > > > > > > run_vmtests.sh will run the test in overcommit mode so that won't be an issue. > > > > > > > > > > Umm, what? You mean overcommit all mode, and that has no bearing on the max > > > > > > > > > > mapping count check. > > > > > > > > > > > > > > > > > > > > In do_mmap(): > > > > > > > > > > > > > > > > > > > > /* Too many mappings? */ > > > > > > > > > > if (mm->map_count > sysctl_max_map_count) > > > > > > > > > > return -ENOMEM; > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > As well as numerous other checks in mm/vma.c. > > > > > > > > > Ah sorry, didn't look at the code properly just assumed that overcommit_always meant overriding > > > > > > > > > this. > > > > > > > > No problem! It's hard to be aware of everything in mm :) > > > > > > > > > > > > > > > > > > I'm not sure why an overcommit toggle is even necessary when you could use > > > > > > > > > > MAP_NORESERVE or simply map PROT_NONE to avoid the OVERCOMMIT_GUESS limits? 
> > > > > > > > > > > > > > > > > > > > I'm pretty confused as to what this test is really achieving honestly. This > > > > > > > > > > isn't a useful way of asserting mmap() behaviour as far as I can tell. > > > > > > > > > Well, seems like a useful way to me at least : ) Not sure if you are in the mood > > > > > > > > > to discuss that but if you'd like me to explain from start to end what the test > > > > > > > > > is doing, I can do that : ) > > > > > > > > > > > > > > > > > I just don't have time right now, I guess I'll have to come back to it > > > > > > > > later... it's not the end of the world for it to be iffy in my view as long as > > > > > > > > it passes, but it might just not be of great value. > > > > > > > > > > > > > > > > Philosophically I'd rather we didn't assert internal implementation details like > > > > > > > > where we place mappings in userland memory. At no point do we promise to not > > > > > > > > leave larger gaps if we feel like it :) > > > > > > > You have a fair point. Anyhow a debate for another day. > > > > > > > > > > > > > > > I'm guessing, reading more, the _real_ test here is some mathematical assertion > > > > > > > > about layout from HIGH_ADDR_SHIFT -> end of address space when using hints. > > > > > > > > > > > > > > > > But again I'm not sure that achieves much and again also is asserting internal > > > > > > > > implementation details. > > > > > > > > > > > > > > > > Correct behaviour of this kind of thing probably better belongs to tests in the > > > > > > > > userland VMA testing I'd say. > > > > > > > > > > > > > > > > Sorry I don't mean to do down work you've done before, just giving an honest > > > > > > > > technical appraisal! > > > > > > > Nah, it will be rather hilarious to see it all go down the drain xD > > > > > > > > > > > > > > > Anyway don't let this block work to fix the test if it's failing. We can revisit > > > > > > > > this later. > > > > > > > Sure. 
@Aboorva and Donet, I still believe that the correct approach is to elide > > > > > > > the gap check at the crossing boundary. What do you think? > > > > > > > > > > > > > One problem I am seeing with this approach is that, since the hint address > > > > > > is generated randomly, the VMAs are also being created at randomly based on > > > > > > the hint address.So, for the VMAs created at high addresses, we cannot guarantee > > > > > > that the gaps between them will be aligned to MAP_CHUNK_SIZE. > > > > > > > > > > > > High address VMAs > > > > > > ----------------- > > > > > > 1000000000000-1000040000000 r--p 00000000 00:00 0 > > > > > > 2000000000000-2000040000000 r--p 00000000 00:00 0 > > > > > > 4000000000000-4000040000000 r--p 00000000 00:00 0 > > > > > > 8000000000000-8000040000000 r--p 00000000 00:00 0 > > > > > > e80009d260000-fffff9d260000 r--p 00000000 00:00 0 > > > > > > > > > > > > I have a different approach to solve this issue. > > > > > It is really weird that such a large amount of VA space > > > > > is left between the two VMAs yet mmap is failing. > > > > > > > > > > > > > > > > > > > > Can you please do the following: > > > > > set /proc/sys/vm/max_map_count to the highest value possible. > > > > > If running without run_vmtests.sh, set /proc/sys/vm/overcommit_memory to 1. > > > > > In validate_complete_va_space: > > > > > > > > > > if (start_addr >= HIGH_ADDR_MARK && found == false) { > > > > > found = true; > > > > > continue; > > > > > } > > > > Thanks Dev for the suggestion. I set max_map_count and set overcommit > > > > memory to 1, added this code change as well, and then tried. Still, the > > > > test is failing > > > > > > > > > where found is initialized to false. This will skip the check > > > > > for the boundary. > > > > > > > > > > After this can you tell whether the test is still failing. > > > > > > > > > > Also can you give me the complete output of proc/pid/maps > > > > > after putting a sleep at the end of the test. 
> > > > > > > > > on powerpc support DEFAULT_MAP_WINDOW is 128TB and with > > > > total address space size is 4PB With hint it can map upto > > > > 4PB. Since the hint addres is random in this test random hing VMAs > > > > are getting created. IIUC this is expected only. > > > > > > > > > > > > 10000000-10010000 r-xp 00000000 fd:05 134226638 /home/donet/linux/tools/testing/selftests/mm/virtual_address_range > > > > 10010000-10020000 r--p 00000000 fd:05 134226638 /home/donet/linux/tools/testing/selftests/mm/virtual_address_range > > > > 10020000-10030000 rw-p 00010000 fd:05 134226638 /home/donet/linux/tools/testing/selftests/mm/virtual_address_range > > > > 30000000-10030000000 r--p 00000000 00:00 0 [anon:virtual_address_range] > > > > 10030770000-100307a0000 rw-p 00000000 00:00 0 [heap] > > > > 1004f000000-7fff8f000000 r--p 00000000 00:00 0 [anon:virtual_address_range] > > > > 7fff8faf0000-7fff8fe00000 rw-p 00000000 00:00 0 > > > > 7fff8fe00000-7fff90030000 r-xp 00000000 fd:00 792355 /usr/lib64/libc.so.6 > > > > 7fff90030000-7fff90040000 r--p 00230000 fd:00 792355 /usr/lib64/libc.so.6 > > > > 7fff90040000-7fff90050000 rw-p 00240000 fd:00 792355 /usr/lib64/libc.so.6 > > > > 7fff90050000-7fff90130000 r-xp 00000000 fd:00 792358 /usr/lib64/libm.so.6 > > > > 7fff90130000-7fff90140000 r--p 000d0000 fd:00 792358 /usr/lib64/libm.so.6 > > > > 7fff90140000-7fff90150000 rw-p 000e0000 fd:00 792358 /usr/lib64/libm.so.6 > > > > 7fff90160000-7fff901a0000 r--p 00000000 00:00 0 [vvar] > > > > 7fff901a0000-7fff901b0000 r-xp 00000000 00:00 0 [vdso] > > > > 7fff901b0000-7fff90200000 r-xp 00000000 fd:00 792351 /usr/lib64/ld64.so.2 > > > > 7fff90200000-7fff90210000 r--p 00040000 fd:00 792351 /usr/lib64/ld64.so.2 > > > > 7fff90210000-7fff90220000 rw-p 00050000 fd:00 792351 /usr/lib64/ld64.so.2 > > > > 7fffc9770000-7fffc9880000 rw-p 00000000 00:00 0 [stack] > > > > 1000000000000-1000040000000 r--p 00000000 00:00 0 [anon:virtual_address_range] > > > > 2000000000000-2000040000000 r--p 
00000000 00:00 0 [anon:virtual_address_range] > > > > 4000000000000-4000040000000 r--p 00000000 00:00 0 [anon:virtual_address_range] > > > > 8000000000000-8000040000000 r--p 00000000 00:00 0 [anon:virtual_address_range] > > > > eb95410220000-fffff90220000 r--p 00000000 00:00 0 [anon:virtual_address_range] > > > > > > > > > > > > > > > > > > > > If I give the hint address serially from 128TB then the address > > > > space is contigous and gap is also MAP_SIZE, the test is passing. > > > > > > > > 10000000-10010000 r-xp 00000000 fd:05 134226638 /home/donet/linux/tools/testing/selftests/mm/virtual_address_range > > > > 10010000-10020000 r--p 00000000 fd:05 134226638 /home/donet/linux/tools/testing/selftests/mm/virtual_address_range > > > > 10020000-10030000 rw-p 00010000 fd:05 134226638 /home/donet/linux/tools/testing/selftests/mm/virtual_address_range > > > > 33000000-10033000000 r--p 00000000 00:00 0 [anon:virtual_address_range] > > > > 10033380000-100333b0000 rw-p 00000000 00:00 0 [heap] > > > > 1006f0f0000-10071000000 rw-p 00000000 00:00 0 > > > > 10071000000-7fffb1000000 r--p 00000000 00:00 0 [anon:virtual_address_range] > > > > 7fffb15d0000-7fffb1800000 r-xp 00000000 fd:00 792355 /usr/lib64/libc.so.6 > > > > 7fffb1800000-7fffb1810000 r--p 00230000 fd:00 792355 /usr/lib64/libc.so.6 > > > > 7fffb1810000-7fffb1820000 rw-p 00240000 fd:00 792355 /usr/lib64/libc.so.6 > > > > 7fffb1820000-7fffb1900000 r-xp 00000000 fd:00 792358 /usr/lib64/libm.so.6 > > > > 7fffb1900000-7fffb1910000 r--p 000d0000 fd:00 792358 /usr/lib64/libm.so.6 > > > > 7fffb1910000-7fffb1920000 rw-p 000e0000 fd:00 792358 /usr/lib64/libm.so.6 > > > > 7fffb1930000-7fffb1970000 r--p 00000000 00:00 0 [vvar] > > > > 7fffb1970000-7fffb1980000 r-xp 00000000 00:00 0 [vdso] > > > > 7fffb1980000-7fffb19d0000 r-xp 00000000 fd:00 792351 /usr/lib64/ld64.so.2 > > > > 7fffb19d0000-7fffb19e0000 r--p 00040000 fd:00 792351 /usr/lib64/ld64.so.2 > > > > 7fffb19e0000-7fffb19f0000 rw-p 00050000 fd:00 792351 
/usr/lib64/ld64.so.2 > > > > 7fffc5470000-7fffc5580000 rw-p 00000000 00:00 0 [stack] > > > > 800000000000-2aab000000000 r--p 00000000 00:00 0 [anon:virtual_address_range] > > > > > > > > > > > Thank you for this output. I can't wrap my head around why this behaviour changes > > > when you generate the hint sequentially. The mmap() syscall is supposed to do the > > > following (irrespective of high VA space or not) - if the allocation at the hint > > Yes, it is working as expected. On PowerPC, the DEFAULT_MAP_WINDOW is > > 128TB, and the system can map up to 4PB. > > > > In the test, the first mmap call maps memory up to 128TB without any > > hint, so the VMAs are created below the 128TB boundary. > > > > In the second mmap call, we provide a hint starting from 256TB, and > > the hint address is generated randomly above 256TB. The mappings are > > correctly created at these hint addresses. Since the hint addresses > > are random, the resulting VMAs are also created at random locations. > > > > So, what I tried is: mapping from 0 to 128TB without any hint, and > > then for the second mmap, instead of starting the hint from 256TB, I > > started from 128TB. Instead of using random hint addresses, I used > > sequential hint addresses from 128TB up to 512TB. With this change, > > the VMAs are created in order, and the test passes. > > > > 800000000000-2aab000000000 r--p 00000000 00:00 0 128TB to 512TB VMA > > > > I think we will see same behaviour on x86 with X86_FEATURE_LA57. > > > > I will send the updated patch in V2. > > Since you say it fails on both radix and hash, it means that the generic > code path is failing. I see that on my system, when I run the test with > LPA2 config, write() fails with errno set to -ENOMEM. Can you apply > the following diff and check whether the test fails still. Doing this > fixed it for arm64. 
> > diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c > > index b380e102b22f..3032902d01f2 100644 > > --- a/tools/testing/selftests/mm/virtual_address_range.c > > +++ b/tools/testing/selftests/mm/virtual_address_range.c > > @@ -173,10 +173,6 @@ static int validate_complete_va_space(void) > > */ > > hop = 0; > > while (start_addr + hop < end_addr) { > > - if (write(fd, (void *)(start_addr + hop), 1) != 1) > > - return 1; > > - lseek(fd, 0, SEEK_SET); > > - > > if (is_marked_vma(vma_name)) > > munmap((char *)(start_addr + hop), MAP_CHUNK_SIZE); > Even with this change, the test is still failing. In this case, we are allocating physical memory and writing into it, but our issue seems to be with the gap between VMAs, so I believe this might not be directly related. I will send the next revision where the test passes and no issues are observed Just curious — with LPA2, is the second mmap() call successful? And are the VMAs being created at the hint address as expected? > > > > > addr succeeds, then all is well, otherwise, do a top-down search for a large > > > enough gap. I am not aware of the nuances in powerpc but I really am suspecting > > > a bug in powerpc mmap code. Can you try to do some tracing - which function > > > eventually fails to find the empty gap? > > > > > > Through my limited code tracing - we should end up in slice_find_area_topdown, > > > then we ask the generic code to find the gap using vm_unmapped_area. So I > > > suspect something is happening between this, probably slice_scan_available(). > > > > > > > > > From 0 to 128TB, we map memory directly without using any hint. For the range above > > > > > > 256TB up to 512TB, we perform the mapping using hint addresses. In the current test, > > > > > > we use random hint addresses, but I have modified it to generate hint addresses linearly > > > > > > starting from 128TB. 
> > > > > > > > > > > > With this change: > > > > > > > > > > > > The 0–128TB range is mapped without hints and verified accordingly. > > > > > > > > > > > > The 128TB–512TB range is mapped using linear hint addresses and then verified. > > > > > > > > > > > > Below are the VMAs obtained with this approach: > > > > > > > > > > > > 10000000-10010000 r-xp 00000000 fd:05 135019531 > > > > > > 10010000-10020000 r--p 00000000 fd:05 135019531 > > > > > > 10020000-10030000 rw-p 00010000 fd:05 135019531 > > > > > > 20000000-10020000000 r--p 00000000 00:00 0 > > > > > > 10020800000-10020830000 rw-p 00000000 00:00 0 > > > > > > 1004bcf0000-1004c000000 rw-p 00000000 00:00 0 > > > > > > 1004c000000-7fff8c000000 r--p 00000000 00:00 0 > > > > > > 7fff8c130000-7fff8c360000 r-xp 00000000 fd:00 792355 > > > > > > 7fff8c360000-7fff8c370000 r--p 00230000 fd:00 792355 > > > > > > 7fff8c370000-7fff8c380000 rw-p 00240000 fd:00 792355 > > > > > > 7fff8c380000-7fff8c460000 r-xp 00000000 fd:00 792358 > > > > > > 7fff8c460000-7fff8c470000 r--p 000d0000 fd:00 792358 > > > > > > 7fff8c470000-7fff8c480000 rw-p 000e0000 fd:00 792358 > > > > > > 7fff8c490000-7fff8c4d0000 r--p 00000000 00:00 0 > > > > > > 7fff8c4d0000-7fff8c4e0000 r-xp 00000000 00:00 0 > > > > > > 7fff8c4e0000-7fff8c530000 r-xp 00000000 fd:00 792351 > > > > > > 7fff8c530000-7fff8c540000 r--p 00040000 fd:00 792351 > > > > > > 7fff8c540000-7fff8c550000 rw-p 00050000 fd:00 792351 > > > > > > 7fff8d000000-7fffcd000000 r--p 00000000 00:00 0 > > > > > > 7fffe9c80000-7fffe9d90000 rw-p 00000000 00:00 0 > > > > > > 800000000000-2000000000000 r--p 00000000 00:00 0 -> High Address (128TB to 512TB) > > > > > > > > > > > > diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c > > > > > > index 4c4c35eac15e..0be008cba4b0 100644 > > > > > > --- a/tools/testing/selftests/mm/virtual_address_range.c > > > > > > +++ b/tools/testing/selftests/mm/virtual_address_range.c > > > > > > @@ 
-56,21 +56,21 @@ > > > > > > #ifdef __aarch64__ > > > > > > #define HIGH_ADDR_MARK ADDR_MARK_256TB > > > > > > -#define HIGH_ADDR_SHIFT 49 > > > > > > +#define HIGH_ADDR_SHIFT 48 > > > > > > #define NR_CHUNKS_LOW NR_CHUNKS_256TB > > > > > > #define NR_CHUNKS_HIGH NR_CHUNKS_3840TB > > > > > > #else > > > > > > #define HIGH_ADDR_MARK ADDR_MARK_128TB > > > > > > -#define HIGH_ADDR_SHIFT 48 > > > > > > +#define HIGH_ADDR_SHIFT 47 > > > > > > #define NR_CHUNKS_LOW NR_CHUNKS_128TB > > > > > > #define NR_CHUNKS_HIGH NR_CHUNKS_384TB > > > > > > #endif > > > > > > -static char *hint_addr(void) > > > > > > +static char *hint_addr(int hint) > > > > > > { > > > > > > - int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); > > > > > > + unsigned long addr = ((1UL << HIGH_ADDR_SHIFT) + (hint * MAP_CHUNK_SIZE)); > > > > > > - return (char *) (1UL << bits); > > > > > > + return (char *) (addr); > > > > > > } > > > > > > static void validate_addr(char *ptr, int high_addr) > > > > > > @@ -217,7 +217,7 @@ int main(int argc, char *argv[]) > > > > > > } > > > > > > for (i = 0; i < NR_CHUNKS_HIGH; i++) { > > > > > > - hint = hint_addr(); > > > > > > + hint = hint_addr(i); > > > > > > hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ, > > > > > > MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); > > > > > > > > > > > > > > > > > > > > > > > > Can we fix it this way?

3 months, 3 weeks

2
1
0 0

[PATCH net 03/10] netlink: specs: ethtool: replace underscores with dashes in names

by Jakub Kicinski

We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen replaces special chars in names) but gives more uniform naming in Python. Fixes: 13e59344fb9d ("net: ethtool: add support for symmetric-xor RSS hash") Fixes: 46fb3ba95b93 ("ethtool: Add an interface for flashing transceiver modules' firmware") Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> --- CC: andrew(a)lunn.ch CC: donald.hunter(a)gmail.com CC: shuah(a)kernel.org CC: kory.maincent(a)bootlin.com CC: sdf(a)fomichev.me CC: gal(a)nvidia.com CC: noren(a)nvidia.com CC: ahmed.zaki(a)intel.com CC: wojciech.drewek(a)intel.com CC: petrm(a)nvidia.com CC: danieller(a)nvidia.com CC: linux-kselftest(a)vger.kernel.org --- Documentation/netlink/specs/ethtool.yaml | 6 +++--- tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 72a076b0e1b5..348c6ad548f5 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -48,7 +48,7 @@ c-version-name: ethtool-genl-version name: started doc: The firmware flashing process has started. - - name: in_progress + name: in-progress doc: The firmware flashing process is in progress. 
- name: completed @@ -1422,7 +1422,7 @@ c-version-name: ethtool-genl-version name: hkey type: binary - - name: input_xfrm + name: input-xfrm type: u32 - name: start-context @@ -2238,7 +2238,7 @@ c-version-name: ethtool-genl-version - hfunc - indir - hkey - - input_xfrm + - input-xfrm dump: request: attributes: diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py index f439c434ba36..648ff50bc1c3 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py +++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py @@ -38,7 +38,7 @@ from lib.py import rand_port raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11") input_xfrm = cfg.ethnl.rss_get( - {'header': {'dev-name': cfg.ifname}}).get('input_xfrm') + {'header': {'dev-name': cfg.ifname}}).get('input-xfrm') # Check for symmetric xor/or-xor if not input_xfrm or (input_xfrm != 1 and input_xfrm != 2): -- 2.49.0

3 months, 3 weeks

3
3
0 0

[PATCH v2 0/3] tools/nolibc: add support for SuperH

by Thomas Weißschuh

Add support for SuperH/"sh" to nolibc. Only sh4 is tested for now. This is only tested on QEMU so far. Additional testing would be very welcome. Test instructions: $ cd tools/testing/selftests/nolibc/ $ make -f Makefile.nolibc ARCH=sh CROSS_COMPILE=sh4-linux- nolibc-test $ file nolibc-test nolibc-test: ELF 32-bit LSB executable, Renesas SH, version 1 (SYSV), statically linked, not stripped $ ./nolibc-test Running test 'startup' 0 argc = 1 [OK] ... Total number of errors: 0 Exiting with status 0 Signed-off-by: Thomas Weißschuh <linux(a)weissschuh.net> --- Changes in v2: - Rebase onto latest nolibc-next - Pick up Ack from Willy - Provide some test instructions - Link to v1: https://lore.kernel.org/r/20250609-nolibc-sh-v1-0-9dcdb1b66bb5@weissschuh.n… --- Thomas Weißschuh (3): selftests/nolibc: fix EXTRACONFIG variables ordering selftests/nolibc: use file driver for QEMU serial tools/nolibc: add support for SuperH tools/include/nolibc/arch-sh.h | 162 +++++++++++++++++++++++++ tools/include/nolibc/arch.h | 2 + tools/testing/selftests/nolibc/Makefile.nolibc | 15 ++- tools/testing/selftests/nolibc/run-tests.sh | 3 +- 4 files changed, 177 insertions(+), 5 deletions(-) --- base-commit: eb135311083100b6590a7545618cd9760d896a86 change-id: 20250528-nolibc-sh-8b4e3bb8efcb Best regards, -- Thomas Weißschuh <linux(a)weissschuh.net>

3 months, 3 weeks

4
7
0 0

[PATCH v3 2/2] selftests/bpf: Add testcases for BPF_ADD and BPF_SUB

by Harishankar Vishwanathan

The previous commit improves the precision in scalar(32)_min_max_add, and scalar(32)_min_max_sub. The improvement in precision occurs in cases when all outcomes overflow or underflow, respectively. This commit adds selftests that exercise those cases. This commit also adds selftests for cases where the output register state bounds for u(32)_min/u(32)_max are conservatively set to unbounded (when there is partial overflow or underflow). Signed-off-by: Harishankar Vishwanathan <harishankar.vishwanathan(a)gmail.com> Co-developed-by: Matan Shachnai <m.shachnai(a)rutgers.edu> Signed-off-by: Matan Shachnai <m.shachnai(a)rutgers.edu> Suggested-by: Eduard Zingerman <eddyz87(a)gmail.com> --- .../selftests/bpf/progs/verifier_bounds.c | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 30e16153fdf1..31986f6c609e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1371,4 +1371,165 @@ __naked void mult_sign_ovf(void) __imm(bpf_skb_store_bytes) : __clobber_all); } + +SEC("socket") +__description("64-bit addition, all outcomes overflow") +__success __log_level(2) +__msg("5: (0f) r3 += r3 {{.*}} R3_w=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe)") +__retval(0) +__naked void add64_full_overflow(void) +{ + asm volatile ( + "r4 = 0;" + "r4 = -r4;" + "r3 = 0xa000000000000000 ll;" + "r3 |= r4;" + "r3 += r3;" + "r0 = 0;" + "exit" + : + : + : __clobber_all); +} + +SEC("socket") +__description("64-bit addition, partial overflow, result in unbounded reg") +__success __log_level(2) +__msg("4: (0f) r3 += r3 {{.*}} R3_w=scalar()") +__retval(0) +__naked void add64_partial_overflow(void) +{ + asm volatile ( + "r4 = 0;" + "r4 = -r4;" + "r3 = 2;" + "r3 |= r4;" + "r3 += r3;" + "r0 = 0;" + "exit" + : + : + : __clobber_all); +} + +SEC("socket") +__description("32-bit addition overflow, 
all outcomes overflow") +__success __log_level(2) +__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=umin=umin32=0x40000000,smax=umax=umax32=0xfffffffe,var_off=(0x0; 0xffffffff))") +__retval(0) +__naked void add32_full_overflow(void) +{ + asm volatile ( + "w4 = 0;" + "w4 = -w4;" + "w3 = 0xa0000000;" + "w3 |= w4;" + "w3 += w3;" + "r0 = 0;" + "exit" + : + : + : __clobber_all); +} + +SEC("socket") +__description("32-bit addition, partial overflow, result in unbounded u32 bounds") +__success __log_level(2) +__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__retval(0) +__naked void add32_partial_overflow(void) +{ + asm volatile ( + "w4 = 0;" + "w4 = -w4;" + "w3 = 2;" + "w3 |= w4;" + "w3 += w3;" + "r0 = 0;" + "exit" + : + : + : __clobber_all); +} + +SEC("socket") +__description("64-bit subtraction, all outcomes underflow") +__success __log_level(2) +__msg("6: (1f) r3 -= r1 {{.*}} R3_w=scalar(umin=1,umax=0x8000000000000000)") +__retval(0) +__naked void sub64_full_overflow(void) +{ + asm volatile ( + "r1 = 0;" + "r1 = -r1;" + "r2 = 0x8000000000000000 ll;" + "r1 |= r2;" + "r3 = 0;" + "r3 -= r1;" + "r0 = 0;" + "exit" + : + : + : __clobber_all); +} + +SEC("socket") +__description("64-bit subtration, partial overflow, result in unbounded reg") +__success __log_level(2) +__msg("3: (1f) r3 -= r2 {{.*}} R3_w=scalar()") +__retval(0) +__naked void sub64_partial_overflow(void) +{ + asm volatile ( + "r3 = 0;" + "r3 = -r3;" + "r2 = 1;" + "r3 -= r2;" + "r0 = 0;" + "exit" + : + : + : __clobber_all); +} + +SEC("socket") +__description("32-bit subtraction overflow, all outcomes underflow") +__success __log_level(2) +__msg("5: (1c) w3 -= w1 {{.*}} R3_w=scalar(smin=umin=umin32=1,smax=umax=umax32=0x80000000,var_off=(0x0; 0xffffffff))") +__retval(0) +__naked void sub32_full_overflow(void) +{ + asm volatile ( + "w1 = 0;" + "w1 = -w1;" + "w2 = 0x80000000;" + "w1 |= w2;" + "w3 = 0;" + "w3 -= w1;" + "r0 = 0;" + "exit" + : + : + : __clobber_all); 
+} + +SEC("socket") +__description("32-bit subtration, partial overflow, result in unbounded u32 bounds") +__success __log_level(2) +__msg("3: (1c) w3 -= w2 {{.*}} R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__retval(0) +__naked void sub32_partial_overflow(void) +{ + asm volatile ( + "w3 = 0;" + "w3 = -w3;" + "w2 = 1;" + "w3 -= w2;" + "r0 = 0;" + "exit" + : + : + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- 2.45.2

3 months, 3 weeks

3
2
0 0

[PATCH v2 00/23] ARM64 PMU Partitioning

by Colton Lewis

This series creates a new PMU scheme on ARM, a partitioned PMU that allows reserving a subset of counters for more direct guest access, significantly reducing overhead. More details, including performance benchmarks, can be read in the v1 cover letter linked below. v2: * Rebased on top of kvm/queue to pick up Sean's patch [1] that reorganizes some of the same headers and would otherwise conflict. * Changed the semantics of the command line parameters and the ioctl. It was pointed out in the comments last time that it doesn't work to repartition at runtime because the perf subsystem assumes the number of counters it gets will not change after the PMU is probed. Now the PMUv3 command line parameters are the sole thing that divides up guest and host counters and the ioctl just toggles a flag for whether a vcpu should use the partitioned PMU. I've also moved from one to two parameters: partition_pmu=[y/n] and reserved_guest_counters=[0-N]. This makes it possible to unambiguously express configurations like a partitioned PMU with 0 general purpose counters exposed to the guest (which still exposes the cycle counter). * Moved the partitioning code into the PMUv3 driver itself so KVM code isn't modifying fields that are otherwise internal to the driver. * Define PMI{CNTR,FILTR} as undef_access since KVM isn't ready to support that counter. It is, however, still handled in the partitioning because the driver recognizes it. * Take out the dependency on FEAT_FGT since it is not widely available on hardware yet. Instead, define a fast path in switch.h for handling accesses to the registers that would otherwise be untrapped. * During MDCR_EL2 setup for guests, ensure the computed HPMN value is always below the number of guest counters allocated by the driver at boot and always below the number of counters on the current CPU. This accounts for the possibility of heterogeneous hardware where a guest might be able to use the partitioned PMU on one CPU but not another.
* The KVM PMU event filter API says that counters must not count while the event is filtered. To ensure this, enforce the filter on every vcpu_load into the guest. * Settable PMCR_EL0.N with a partitioned PMU now works and the vcpu_counter_access selftest changes reflect that. v1: https://lore.kernel.org/kvm/20250602192702.2125115-1-coltonlewis@google.com/ Colton Lewis (22): arm64: cpufeature: Add cpucap for HPMN0 arm64: Generate sign macro for sysreg Enums arm64: cpufeature: Add cpucap for PMICNTR arm64: Define PMI{CNTR,FILTR}_EL0 as undef_access KVM: arm64: Reorganize PMU functions perf: arm_pmuv3: Introduce method to partition the PMU perf: arm_pmuv3: Generalize counter bitmasks perf: arm_pmuv3: Keep out of guest counter partition KVM: arm64: Correct kvm_arm_pmu_get_max_counters() KVM: arm64: Set up FGT for Partitioned PMU KVM: arm64: Writethrough trapped PMEVTYPER register KVM: arm64: Use physical PMSELR for PMXEVTYPER if partitioned KVM: arm64: Writethrough trapped PMOVS register KVM: arm64: Write fast path PMU register handlers KVM: arm64: Setup MDCR_EL2 to handle a partitioned PMU KVM: arm64: Account for partitioning in PMCR_EL0 access KVM: arm64: Context swap Partitioned PMU guest registers KVM: arm64: Enforce PMU event filter at vcpu_load() perf: arm_pmuv3: Handle IRQs for Partitioned PMU guest counters KVM: arm64: Inject recorded guest interrupts KVM: arm64: Add ioctl to partition the PMU when supported KVM: arm64: selftests: Add test case for partitioned PMU Marc Zyngier (1): KVM: arm64: Cleanup PMU includes Documentation/virt/kvm/api.rst | 21 + arch/arm/include/asm/arm_pmuv3.h | 34 + arch/arm64/include/asm/arm_pmuv3.h | 61 +- arch/arm64/include/asm/kvm_host.h | 20 +- arch/arm64/include/asm/kvm_pmu.h | 61 ++ arch/arm64/kernel/cpufeature.c | 15 + arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/arm.c | 22 + arch/arm64/kvm/debug.c | 24 +- arch/arm64/kvm/hyp/include/hyp/switch.h | 233 ++++++ arch/arm64/kvm/pmu-emul.c | 676 +---------------- 
arch/arm64/kvm/pmu-part.c | 359 +++++++++ arch/arm64/kvm/pmu.c | 687 ++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 66 +- arch/arm64/tools/cpucaps | 2 + arch/arm64/tools/gen-sysreg.awk | 1 + arch/arm64/tools/sysreg | 6 +- drivers/perf/arm_pmuv3.c | 150 +++- include/linux/perf/arm_pmu.h | 15 +- include/linux/perf/arm_pmuv3.h | 14 +- include/uapi/linux/kvm.h | 4 + tools/include/uapi/linux/kvm.h | 2 + .../selftests/kvm/arm64/vpmu_counter_access.c | 63 +- virt/kvm/kvm_main.c | 1 + 24 files changed, 1791 insertions(+), 748 deletions(-) create mode 100644 arch/arm64/kvm/pmu-part.c base-commit: 79150772457f4d45e38b842d786240c36bb1f97f -- 2.50.0.714.g196bf9f422-goog

3 months, 3 weeks

3
39
0 0

[PATCH] selftests: kvm: Fix spelling of 'occurrences' in sparsebit.c comments

by Rahul Kumar

Corrected two instances of the misspelled word 'occurences' to 'occurrences' in comments explaining node invariants in sparsebit.c. These comments describe core behavior of the data structure and should be clear. Signed-off-by: Rahul Kumar <rk0006818(a)gmail.com> --- tools/testing/selftests/kvm/lib/sparsebit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c index cfed9d26cc71..a99188f87a38 100644 --- a/tools/testing/selftests/kvm/lib/sparsebit.c +++ b/tools/testing/selftests/kvm/lib/sparsebit.c @@ -116,7 +116,7 @@ * * + A node with all mask bits set only occurs when the last bit * described by the previous node is not equal to this nodes - * starting index - 1. All such occurences of this condition are + * starting index - 1. All such occurrences of this condition are * avoided by moving the setting of the nodes mask bits into * the previous nodes num_after setting. * @@ -592,7 +592,7 @@ static struct node *node_split(struct sparsebit *s, sparsebit_idx_t idx) * * + A node with all mask bits set only occurs when the last bit * described by the previous node is not equal to this nodes - * starting index - 1. All such occurences of this condition are + * starting index - 1. All such occurrences of this condition are * avoided by moving the setting of the nodes mask bits into * the previous nodes num_after setting. */ -- 2.43.0 This patch fixes two misspellings of the word 'occurrences' in comments within sparsebit.c used by the KVM selftests. Fixing the spelling improves readability and clarity of the documented behavior. Only comment text has been changed — there are no modifications to the functional logic of the tests. I would appreciate your review and any feedback you may have. Thank you for your time and support. Best regards, Rahul Kumar

3 months, 3 weeks

2
1
0 0

[PATCH v3 00/13] KVM: Make irqfd registration globally unique

by Sean Christopherson

Non-KVM folks, I am hoping to route this through the KVM tree (6.17 or later), as the non-KVM changes should be glorified nops. Please holler if you object to that idea. Hyper-V folks in particular, let me know if you want a stable topic branch/tag, e.g. on the off chance you want to make similar changes to the Hyper-V code, and I'll make sure that happens. As for what this series actually does... Rework KVM's irqfd registration to require that an eventfd is bound to at most one irqfd throughout the entire system. KVM currently disallows binding an eventfd to multiple irqfds for a single VM, but doesn't reject attempts to bind an eventfd to multiple VMs. This is obviously an ABI change, but I'm fairly confident that it won't break userspace, because binding an eventfd to multiple irqfds hasn't truly worked since commit e8dbf19508a1 ("kvm/eventfd: Use priority waitqueue to catch events before userspace"). A somewhat undocumented, and perhaps even unintentional, side effect of suppressing eventfd notifications for userspace is that the priority+exclusive behavior also suppresses eventfd notifications for any subsequent waiters, even if they are priority waiters. I.e. only the first VM with an irqfd+eventfd binding will get notifications. And for IRQ bypass, a.k.a. device posted interrupts, globally unique bindings are a hard requirement (at least on x86; I assume other archs are the same). KVM and the IRQ bypass manager kinda sorta handle this, but in the absolute worst way possible (IMO). Instead of surfacing an error to userspace, KVM silently ignores IRQ bypass registration errors. The motivation for this series is to harden against userspace goofs. AFAIK, we (Google) have never actually had a bug where userspace tries to assign an eventfd to multiple VMs, but the possibility has come up in more than one bug investigation (our intra-host, a.k.a. copyless, migration scheme transfers eventfds from the old to the new VM when updating the host VMM). 
v3: - Retain WQ_FLAG_EXCLUSIVE in mshv_eventfd.c, which snuck in between v1 and v2. [Peter] - Use EXPORT_SYMBOL_GPL. [Peter] - Move WQ_FLAG_EXCLUSIVE out of add_wait_queue_priority() in a prep patch so that the affected subsystems are more explicitly documented (and then immediately drop the flag from drivers/xen/privcmd.c, which amusingly hides that file from the diff stats). v2: - https://lore.kernel.org/all/20250519185514.2678456-1-seanjc@google.com - Use guard(spinlock_irqsave). [Prateek] v1: https://lore.kernel.org/all/20250401204425.904001-1-seanjc@google.com Sean Christopherson (13): KVM: Use a local struct to do the initial vfs_poll() on an irqfd KVM: Acquire SCRU lock outside of irqfds.lock during assignment KVM: Initialize irqfd waitqueue callback when adding to the queue KVM: Add irqfd to KVM's list via the vfs_poll() callback KVM: Add irqfd to eventfd's waitqueue while holding irqfds.lock sched/wait: Drop WQ_FLAG_EXCLUSIVE from add_wait_queue_priority() xen: privcmd: Don't mark eventfd waiter as EXCLUSIVE sched/wait: Add a waitqueue helper for fully exclusive priority waiters KVM: Disallow binding multiple irqfds to an eventfd with a priority waiter KVM: Drop sanity check that per-VM list of irqfds is unique KVM: selftests: Assert that eventfd() succeeds in Xen shinfo test KVM: selftests: Add utilities to create eventfds and do KVM_IRQFD KVM: selftests: Add a KVM_IRQFD test to verify uniqueness requirements drivers/hv/mshv_eventfd.c | 8 ++ include/linux/kvm_irqfd.h | 1 - include/linux/wait.h | 2 + kernel/sched/wait.c | 22 ++- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/arm64/vgic_irq.c | 12 +- .../testing/selftests/kvm/include/kvm_util.h | 40 ++++++ tools/testing/selftests/kvm/irqfd_test.c | 130 ++++++++++++++++++ .../selftests/kvm/x86/xen_shinfo_test.c | 21 +-- virt/kvm/eventfd.c | 130 +++++++++++++----- 10 files changed, 302 insertions(+), 65 deletions(-) create mode 100644 tools/testing/selftests/kvm/irqfd_test.c 
base-commit: 45eb29140e68ffe8e93a5471006858a018480a45 -- 2.49.0.1151.ga128411c76-goog

3 months, 3 weeks

4
20
0 0

[PATCH net-next] selftests: net: add netpoll basic functionality test

by Breno Leitao

Add a basic selftest for the netpoll polling mechanism, specifically targeting the netpoll poll() side. The test creates a scenario where network transmission is running at maximum speed, and netpoll needs to poll the NIC. This is achieved by: 1. Configuring a single RX/TX queue to create contention 2. Generating background traffic to saturate the interface 3. Sending netconsole messages to trigger netpoll polling 4. Using dynamic netconsole targets via configfs 5. Delete and create new netconsole targets after 5 iterations The test validates a critical netpoll code path by monitoring traffic flow and ensuring netpoll_poll_dev() is called when the normal TX path is blocked. Perf probing confirms this test successfully triggers netpoll_poll_dev() in typical test runs. This addresses a gap in netpoll test coverage for a path that is tricky for the network stack. Signed-off-by: Breno Leitao <leitao(a)debian.org> --- Changes since RFC: - Toggle the netconsole interfaces up and down after 5 iterations. - Moved the traffic check under DEBUG (Willem de Bruijn). - Bumped the iterations to 20 given it runs faster now. 
- Link to the RFC: https://lore.kernel.org/r/20250612-netpoll_test-v1-1-4774fd95933f@debian.org --- tools/testing/selftests/drivers/net/Makefile | 1 + .../testing/selftests/drivers/net/netpoll_basic.py | 231 +++++++++++++++++++++ 2 files changed, 232 insertions(+) diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index bd309b2d39095..9bd84d6b542e5 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -16,6 +16,7 @@ TEST_PROGS := \ netcons_fragmented_msg.sh \ netcons_overflow.sh \ netcons_sysdata.sh \ + netpoll_basic.py \ ping.py \ queues.py \ stats.py \ diff --git a/tools/testing/selftests/drivers/net/netpoll_basic.py b/tools/testing/selftests/drivers/net/netpoll_basic.py new file mode 100755 index 0000000000000..2a81926169262 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netpoll_basic.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +# This test aims to evaluate the netpoll polling mechanism (as in +# netpoll_poll_dev()). It presents a complex scenario where the network +# attempts to send a packet but fails, prompting it to poll the NIC from within +# the netpoll TX side. +# +# This has been a crucial path in netpoll that was previously untested. Jakub +# suggested using a single RX/TX queue, pushing traffic to the NIC, and then +# sending netpoll messages (via netconsole) to trigger the poll. `perf` probing +# of netpoll_poll_dev() showed that this test indeed triggers +# netpoll_poll_dev() once or twice in 10 iterations. 
+ +# Author: Breno Leitao <leitao(a)debian.org> + +import errno +import os +import random +import string +import time + +from lib.py import ( + ethtool, + GenerateTraffic, + ksft_exit, + ksft_pr, + ksft_run, + KsftFailEx, + KsftSkipEx, + NetdevFamily, + NetDrvEpEnv, +) + +NETCONSOLE_CONFIGFS_PATH = "/sys/kernel/config/netconsole" +REMOTE_PORT = 6666 +LOCAL_PORT = 1514 +# Number of netcons messages to send. I usually see netpoll_poll_dev() +# being called at least once in 10 iterations. Having 20 to have some buffers +ITERATIONS = 20 +DEBUG = False + + +def generate_random_netcons_name() -> str: + """Generate a random target name starting with 'netcons'""" + random_suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=8)) + return f"netcons_{random_suffix}" + + +def get_stats(cfg: NetDrvEpEnv, netdevnl: NetdevFamily) -> dict[str, int]: + """Get the statistics for the interface""" + return netdevnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + + +def set_single_rx_tx_queue(interface_name: str) -> None: + """Set the number of RX and TX queues to 1 using ethtool""" + try: + # This don't need to be reverted, since interfaces will be deleted after test + ethtool(f"-G {interface_name} rx 1 tx 1") + except Exception as e: + raise KsftSkipEx( + f"Failed to configure RX/TX queues: {e}. Ethtool not available?" 
+ ) + + +def create_netconsole_target( + config_data: dict[str, str], + target_name: str, +) -> None: + """Create a netconsole dynamic target against the interfaces""" + ksft_pr(f"Using netconsole name: {target_name}") + try: + os.makedirs(f"{NETCONSOLE_CONFIGFS_PATH}/{target_name}", exist_ok=True) + ksft_pr(f"Created target directory: {NETCONSOLE_CONFIGFS_PATH}/{target_name}") + except OSError as e: + if e.errno != errno.EEXIST: + raise KsftFailEx(f"Failed to create netconsole target directory: {e}") + + try: + for key, value in config_data.items(): + if DEBUG: + ksft_pr(f"Setting {key} to {value}") + with open( + f"{NETCONSOLE_CONFIGFS_PATH}/{target_name}/{key}", + "w", + encoding="utf-8", + ) as f: + # Always convert to string to write to file + f.write(str(value)) + f.close() + + if DEBUG: + # Read all configuration values for debugging + for debug_key in config_data.keys(): + with open( + f"{NETCONSOLE_CONFIGFS_PATH}/{target_name}/{debug_key}", + "r", + encoding="utf-8", + ) as f: + content = f.read() + ksft_pr( + f"{NETCONSOLE_CONFIGFS_PATH}/{target_name}/{debug_key} {content}" + ) + + except Exception as e: + raise KsftFailEx(f"Failed to configure netconsole target: {e}") + + +def set_netconsole(cfg: NetDrvEpEnv, interface_name: str, target_name: str) -> None: + """Configure netconsole on the interface with the given target name""" + config_data = { + "extended": "1", + "dev_name": interface_name, + "local_port": LOCAL_PORT, + "remote_port": REMOTE_PORT, + "local_ip": cfg.addr_v["4"] if cfg.addr_ipver == "4" else cfg.addr_v["6"], + "remote_ip": ( + cfg.remote_addr_v["4"] if cfg.addr_ipver == "4" else cfg.remote_addr_v["6"] + ), + "remote_mac": "00:00:00:00:00:00", # Not important for this test + "enabled": "1", + } + + create_netconsole_target(config_data, target_name) + ksft_pr(f"Created netconsole target: {target_name} on interface {interface_name}") + + +def delete_netconsole_target(name: str) -> None: + """Delete a netconsole dynamic target""" + 
target_path = f"{NETCONSOLE_CONFIGFS_PATH}/{name}" + try: + if os.path.exists(target_path): + os.rmdir(target_path) + except OSError as e: + raise KsftFailEx(f"Failed to delete netconsole target: {e}") + + +def check_traffic_flowing(cfg: NetDrvEpEnv, netdevnl: NetdevFamily) -> int: + """Check if traffic is flowing on the interface""" + stat1 = get_stats(cfg, netdevnl) + time.sleep(1) + stat2 = get_stats(cfg, netdevnl) + pkts_per_sec = stat2["rx-packets"] - stat1["rx-packets"] + # Just make sure this will not fail even in slow/debug kernels + if pkts_per_sec < 10: + raise KsftFailEx(f"Traffic seems low: {pkts_per_sec}") + if DEBUG: + ksft_pr(f"Traffic per second {pkts_per_sec}") + + return pkts_per_sec + + +def do_netpoll_flush( + cfg: NetDrvEpEnv, netdevnl: NetdevFamily, ifname: str, target_name: str +) -> None: + """Print messages to the console, trying to trigger a netpoll poll""" + + set_netconsole(cfg, ifname, target_name) + for i in range(int(ITERATIONS)): + msg = f"netcons test #{i}." 
+ + if DEBUG: + pkts_per_s = check_traffic_flowing(cfg, netdevnl) + msg += f" ({pkts_per_s} packets/s)" + + with open("/dev/kmsg", "w", encoding="utf-8") as kmsg: + kmsg.write(msg) + + if not i % 5: + # Every 5 iterations, toggle netconsole + delete_netconsole_target(target_name) + set_netconsole(cfg, ifname, target_name) + + +def test_netpoll(cfg: NetDrvEpEnv, netdevnl: NetdevFamily) -> None: + """ + Test netpoll by sending traffic to the interface and then sending + netconsole messages to trigger a poll + """ + + target_name = generate_random_netcons_name() + ifname = cfg.dev["ifname"] + traffic = None + + try: + set_single_rx_tx_queue(ifname) + traffic = GenerateTraffic(cfg) + check_traffic_flowing(cfg, netdevnl) + do_netpoll_flush(cfg, netdevnl, ifname, target_name) + finally: + if traffic: + traffic.stop() + delete_netconsole_target(target_name) + + +def check_dependencies() -> None: + """Check if the dependencies are met""" + if not os.path.exists(NETCONSOLE_CONFIGFS_PATH): + raise KsftSkipEx( + f"Directory {NETCONSOLE_CONFIGFS_PATH} does not exist. CONFIG_NETCONSOLE_DYNAMIC might not be set." + ) + + +def load_netconsole_module() -> None: + """Try to load the netconsole module""" + try: + os.system("modprobe netconsole") + except Exception: + # It is fine if we fail to load the module, it will fail later + # at check_dependencies() + pass + + +def main() -> None: + """Main function to run the test""" + load_netconsole_module() + check_dependencies() + netdevnl = NetdevFamily() + with NetDrvEpEnv(__file__, nsim_test=True) as cfg: + ksft_run( + [test_netpoll], + args=( + cfg, + netdevnl, + ), + ) + ksft_exit() + + +if __name__ == "__main__": + main() --- base-commit: 4f4040ea5d3e4bebebbef9379f88085c8b99221c change-id: 20250612-netpoll_test-a1324d2057c8 Best regards, -- Breno Leitao <leitao(a)debian.org>

3 months, 3 weeks

2
3
0 0

[PATCH] mm/selftests: improve UFFD-WP feature detection in KSM test

by Li Wang

The current implementation of test_unmerge_uffd_wp() explicitly sets `uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP` before calling UFFDIO_API. This can cause the ioctl() call to fail with EINVAL on kernels that do not support UFFD-WP, leading the test to fail unnecessarily: # ------------------------------ # running ./ksm_functional_tests # ------------------------------ # TAP version 13 # 1..9 # # [RUN] test_unmerge # ok 1 Pages were unmerged # # [RUN] test_unmerge_zero_pages # ok 2 KSM zero pages were unmerged # # [RUN] test_unmerge_discarded # ok 3 Pages were unmerged # # [RUN] test_unmerge_uffd_wp # not ok 4 UFFDIO_API failed <----- # # [RUN] test_prot_none # ok 5 Pages were unmerged # # [RUN] test_prctl # ok 6 Setting/clearing PR_SET_MEMORY_MERGE works # # [RUN] test_prctl_fork # # No pages got merged # # [RUN] test_prctl_fork_exec # ok 7 PR_SET_MEMORY_MERGE value is inherited # # [RUN] test_prctl_unmerge # ok 8 Pages were unmerged # Bail out! 1 out of 8 tests failed # # Planned tests != run tests (9 != 8) # # Totals: pass:7 fail:1 xfail:0 xpass:0 skip:0 error:0 # [FAIL] This patch improves compatibility and error handling by: 1. Changing the feature check to first query supported features (features=0) rather than specifically requesting WP support. 2. Gracefully skipping the test if: - UFFDIO_API fails with EINVAL (feature not supported), or - UFFD_FEATURE_PAGEFAULT_FLAG_WP is not advertised by the kernel. 3. Providing better diagnostics by distinguishing expected failures (e.g., EINVAL) from unexpected ones and reporting them using strerror(). The updated logic makes the test more robust across different kernel versions and configurations, while preserving existing behavior on systems that do support UFFD-WP.
Signed-off-by: Li Wang <liwang(a)redhat.com> Cc: Aruna Ramakrishna <aruna.ramakrishna(a)oracle.com> Cc: Bagas Sanjaya <bagasdotme(a)gmail.com> Cc: Catalin Marinas <catalin.marinas(a)arm.com> Cc: Dave Hansen <dave.hansen(a)linux.intel.com> Cc: David Hildenbrand <david(a)redhat.com> Cc: Joey Gouly <joey.gouly(a)arm.com> Cc: Johannes Weiner <hannes(a)cmpxchg.org> Cc: Keith Lucas <keith.lucas(a)oracle.com> Cc: Ryan Roberts <ryan.roberts(a)arm.com> Cc: Shuah Khan <shuah(a)kernel.org> --- tools/testing/selftests/mm/ksm_functional_tests.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index b61803e36d1c..f3db257dc555 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -393,9 +393,13 @@ static void test_unmerge_uffd_wp(void) /* See if UFFD-WP is around. */ uffdio_api.api = UFFD_API; - uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; + uffdio_api.features = 0; if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { - ksft_test_result_fail("UFFDIO_API failed\n"); + if (errno == EINVAL) + ksft_test_result_skip("UFFDIO_API not supported (EINVAL)\n"); + else + ksft_test_result_fail("UFFDIO_API failed: %s\n", strerror(errno)); + goto close_uffd; } if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { -- 2.49.0

3 months, 3 weeks

4
10
0 0

[PATCH v3] selftests: riscv: add misaligned access testing

by Clément Léger

This selftest tests all the currently emulated instructions (except for the RV32 compressed ones which are left as a future exercise for a RV32 user). For the FPU instructions, all the FPU registers are tested. Signed-off-by: Clément Léger <cleger(a)rivosinc.com> --- Note: This test can be executed with the FWFT series [1] or using an SBI firmware that delegates misaligned traps by default. If using QEMU, you will need the patches mentioned at [2] so that misaligned accesses will generate a trap. Note: This commit was part of a series [3] that was partially merged. Note: the remaining checkpatch errors are not applicable to this test, which is a user-space one and does not use the kernel headers. Macros with complex values cannot be enclosed in a do-while loop since they are generating functions. Link: https://lore.kernel.org/all/20250424173204.1948385-1-cleger@rivosinc.com/ [1] Link: https://lore.kernel.org/all/20241211211933.198792-1-fkonrad@amd.com/ [2] Link: https://lore.kernel.org/linux-riscv/20250422162324.956065-1-cleger@rivosinc… [3] V3: - Fixed a segfault and a sign extension error found when compiling with -O<x>, x != 0 (Alex) - Use inline assembly to generate the sigbus and avoid GCC optimizations V2: - Fix commit description - Fix a few errors reported by checkpatch.pl tools/testing/selftests/riscv/Makefile | 2 +- .../selftests/riscv/misaligned/.gitignore | 1 + .../selftests/riscv/misaligned/Makefile | 12 + .../selftests/riscv/misaligned/common.S | 33 +++ .../testing/selftests/riscv/misaligned/fpu.S | 180 +++++++++++++ tools/testing/selftests/riscv/misaligned/gp.S | 103 +++++++ .../selftests/riscv/misaligned/misaligned.c | 253 ++++++++++++++++++ 7 files changed, 583 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/riscv/misaligned/.gitignore create mode 100644 tools/testing/selftests/riscv/misaligned/Makefile create mode 100644 tools/testing/selftests/riscv/misaligned/common.S create mode 100644
tools/testing/selftests/riscv/misaligned/fpu.S create mode 100644 tools/testing/selftests/riscv/misaligned/gp.S create mode 100644 tools/testing/selftests/riscv/misaligned/misaligned.c diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile index 099b8c1f46f8..95a98ceeb3b3 100644 --- a/tools/testing/selftests/riscv/Makefile +++ b/tools/testing/selftests/riscv/Makefile @@ -5,7 +5,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),riscv)) -RISCV_SUBTARGETS ?= abi hwprobe mm sigreturn vector +RISCV_SUBTARGETS ?= abi hwprobe mm sigreturn vector misaligned else RISCV_SUBTARGETS := endif diff --git a/tools/testing/selftests/riscv/misaligned/.gitignore b/tools/testing/selftests/riscv/misaligned/.gitignore new file mode 100644 index 000000000000..5eff15a1f981 --- /dev/null +++ b/tools/testing/selftests/riscv/misaligned/.gitignore @@ -0,0 +1 @@ +misaligned diff --git a/tools/testing/selftests/riscv/misaligned/Makefile b/tools/testing/selftests/riscv/misaligned/Makefile new file mode 100644 index 000000000000..1aa40110c50d --- /dev/null +++ b/tools/testing/selftests/riscv/misaligned/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2021 ARM Limited +# Originally tools/testing/arm64/abi/Makefile + +CFLAGS += -I$(top_srcdir)/tools/include + +TEST_GEN_PROGS := misaligned + +include ../../lib.mk + +$(OUTPUT)/misaligned: misaligned.c fpu.S gp.S + $(CC) -g3 -static -o$@ -march=rv64imafdc $(CFLAGS) $(LDFLAGS) $^ diff --git a/tools/testing/selftests/riscv/misaligned/common.S b/tools/testing/selftests/riscv/misaligned/common.S new file mode 100644 index 000000000000..8fa00035bd5d --- /dev/null +++ b/tools/testing/selftests/riscv/misaligned/common.S @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 Rivos Inc. 
+ * + * Authors: + * Clément Léger <cleger(a)rivosinc.com> + */ + +.macro lb_sb temp, offset, src, dst + lb \temp, \offset(\src) + sb \temp, \offset(\dst) +.endm + +.macro copy_long_to temp, src, dst + lb_sb \temp, 0, \src, \dst, + lb_sb \temp, 1, \src, \dst, + lb_sb \temp, 2, \src, \dst, + lb_sb \temp, 3, \src, \dst, + lb_sb \temp, 4, \src, \dst, + lb_sb \temp, 5, \src, \dst, + lb_sb \temp, 6, \src, \dst, + lb_sb \temp, 7, \src, \dst, +.endm + +.macro sp_stack_prologue offset + addi sp, sp, -8 + sub sp, sp, \offset +.endm + +.macro sp_stack_epilogue offset + add sp, sp, \offset + addi sp, sp, 8 +.endm diff --git a/tools/testing/selftests/riscv/misaligned/fpu.S b/tools/testing/selftests/riscv/misaligned/fpu.S new file mode 100644 index 000000000000..a7ad4430a424 --- /dev/null +++ b/tools/testing/selftests/riscv/misaligned/fpu.S @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 Rivos Inc. + * + * Authors: + * Clément Léger <cleger(a)rivosinc.com> + */ + +#include "common.S" + +#define CASE_ALIGN 4 + +.macro fpu_load_inst fpreg, inst, precision, load_reg +.align CASE_ALIGN + \inst \fpreg, 0(\load_reg) + fmv.\precision fa0, \fpreg + j 2f +.endm + +#define flw(__fpreg) fpu_load_inst __fpreg, flw, s, a4 +#define fld(__fpreg) fpu_load_inst __fpreg, fld, d, a4 +#define c_flw(__fpreg) fpu_load_inst __fpreg, c.flw, s, a4 +#define c_fld(__fpreg) fpu_load_inst __fpreg, c.fld, d, a4 +#define c_fldsp(__fpreg) fpu_load_inst __fpreg, c.fldsp, d, sp + +.macro fpu_store_inst fpreg, inst, precision, store_reg +.align CASE_ALIGN + fmv.\precision \fpreg, fa0 + \inst \fpreg, 0(\store_reg) + j 2f +.endm + +#define fsw(__fpreg) fpu_store_inst __fpreg, fsw, s, a4 +#define fsd(__fpreg) fpu_store_inst __fpreg, fsd, d, a4 +#define c_fsw(__fpreg) fpu_store_inst __fpreg, c.fsw, s, a4 +#define c_fsd(__fpreg) fpu_store_inst __fpreg, c.fsd, d, a4 +#define c_fsdsp(__fpreg) fpu_store_inst __fpreg, c.fsdsp, d, sp + +.macro fp_test_prologue + move a4, a1 + /* + 
* Compute jump offset to store the correct FP register since we don't + * have indirect FP register access (or at least we don't use this + * extension so that works on all archs) + */ + sll t0, a0, CASE_ALIGN + la t2, 1f + add t0, t0, t2 + jr t0 +.align CASE_ALIGN +1: +.endm + +.macro fp_test_prologue_compressed + /* FP registers for compressed instructions starts from 8 to 16 */ + addi a0, a0, -8 + fp_test_prologue +.endm + +#define fp_test_body_compressed(__inst_func) \ + __inst_func(f8); \ + __inst_func(f9); \ + __inst_func(f10); \ + __inst_func(f11); \ + __inst_func(f12); \ + __inst_func(f13); \ + __inst_func(f14); \ + __inst_func(f15); \ +2: + +#define fp_test_body(__inst_func) \ + __inst_func(f0); \ + __inst_func(f1); \ + __inst_func(f2); \ + __inst_func(f3); \ + __inst_func(f4); \ + __inst_func(f5); \ + __inst_func(f6); \ + __inst_func(f7); \ + __inst_func(f8); \ + __inst_func(f9); \ + __inst_func(f10); \ + __inst_func(f11); \ + __inst_func(f12); \ + __inst_func(f13); \ + __inst_func(f14); \ + __inst_func(f15); \ + __inst_func(f16); \ + __inst_func(f17); \ + __inst_func(f18); \ + __inst_func(f19); \ + __inst_func(f20); \ + __inst_func(f21); \ + __inst_func(f22); \ + __inst_func(f23); \ + __inst_func(f24); \ + __inst_func(f25); \ + __inst_func(f26); \ + __inst_func(f27); \ + __inst_func(f28); \ + __inst_func(f29); \ + __inst_func(f30); \ + __inst_func(f31); \ +2: +.text + +#define __gen_test_inst(__inst, __suffix) \ +.global test_ ## __inst; \ +test_ ## __inst:; \ + fp_test_prologue ## __suffix; \ + fp_test_body ## __suffix(__inst); \ + ret + +#define gen_test_inst_compressed(__inst) \ + .option arch,+c; \ + __gen_test_inst(c_ ## __inst, _compressed) + +#define gen_test_inst(__inst) \ + .balign 16; \ + .option push; \ + .option arch,-c; \ + __gen_test_inst(__inst, ); \ + .option pop + +.macro fp_test_prologue_load_compressed_sp + copy_long_to t0, a1, sp +.endm + +.macro fp_test_epilogue_load_compressed_sp +.endm + +.macro fp_test_prologue_store_compressed_sp 
+.endm + +.macro fp_test_epilogue_store_compressed_sp + copy_long_to t0, sp, a1 +.endm + +#define gen_inst_compressed_sp(__inst, __type) \ + .global test_c_ ## __inst ## sp; \ + test_c_ ## __inst ## sp:; \ + sp_stack_prologue a2; \ + fp_test_prologue_## __type ## _compressed_sp; \ + fp_test_prologue_compressed; \ + fp_test_body_compressed(c_ ## __inst ## sp); \ + fp_test_epilogue_## __type ## _compressed_sp; \ + sp_stack_epilogue a2; \ + ret + +#define gen_test_load_compressed_sp(__inst) gen_inst_compressed_sp(__inst, load) +#define gen_test_store_compressed_sp(__inst) gen_inst_compressed_sp(__inst, store) + +/* + * float_fsw_reg - Set a FP register from a register containing the value + * a0 = FP register index to be set + * a1 = addr where to store register value + * a2 = address offset + * a3 = value to be store + */ +gen_test_inst(fsw) + +/* + * float_flw_reg - Get a FP register value and return it + * a0 = FP register index to be retrieved + * a1 = addr to load register from + * a2 = address offset + */ +gen_test_inst(flw) + +gen_test_inst(fsd) +#ifdef __riscv_compressed +gen_test_inst_compressed(fsd) +gen_test_store_compressed_sp(fsd) +#endif + +gen_test_inst(fld) +#ifdef __riscv_compressed +gen_test_inst_compressed(fld) +gen_test_load_compressed_sp(fld) +#endif diff --git a/tools/testing/selftests/riscv/misaligned/gp.S b/tools/testing/selftests/riscv/misaligned/gp.S new file mode 100644 index 000000000000..f53f4c6d81dd --- /dev/null +++ b/tools/testing/selftests/riscv/misaligned/gp.S @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 Rivos Inc. 
+ * + * Authors: + * Clément Léger <cleger(a)rivosinc.com> + */ + +#include "common.S" + +.text + +.macro __gen_test_inst inst, src_reg + \inst a2, 0(\src_reg) + move a0, a2 +.endm + +.macro gen_func_header func_name, rvc + .option arch,\rvc + .global test_\func_name + test_\func_name: +.endm + +.macro gen_test_inst inst + .option push + gen_func_header \inst, -c + __gen_test_inst \inst, a0 + .option pop + ret +.endm + +.macro __gen_test_inst_c name, src_reg + .option push + gen_func_header c_\name, +c + __gen_test_inst c.\name, \src_reg + .option pop + ret +.endm + +.macro gen_test_inst_c name + __gen_test_inst_c \name, a0 +.endm + + +.macro gen_test_inst_load_c_sp name + .option push + gen_func_header c_\name\()sp, +c + sp_stack_prologue a1 + copy_long_to t0, a0, sp + c.ldsp a0, 0(sp) + sp_stack_epilogue a1 + .option pop + ret +.endm + +.macro lb_sp_sb_a0 reg, offset + lb_sb \reg, \offset, sp, a0 +.endm + +.macro gen_test_inst_store_c_sp inst_name + .option push + gen_func_header c_\inst_name\()sp, +c + /* Misalign stack pointer */ + sp_stack_prologue a1 + /* Misalign access */ + c.sdsp a2, 0(sp) + copy_long_to t0, sp, a0 + sp_stack_epilogue a1 + .option pop + ret +.endm + + + /* + * a0 = addr to load from + * a1 = address offset + * a2 = value to be loaded + */ +gen_test_inst lh +gen_test_inst lhu +gen_test_inst lw +gen_test_inst lwu +gen_test_inst ld +#ifdef __riscv_compressed +gen_test_inst_c lw +gen_test_inst_c ld +gen_test_inst_load_c_sp ld +#endif + +/* + * a0 = addr where to store value + * a1 = address offset + * a2 = value to be stored + */ +gen_test_inst sh +gen_test_inst sw +gen_test_inst sd +#ifdef __riscv_compressed +gen_test_inst_c sw +gen_test_inst_c sd +gen_test_inst_store_c_sp sd +#endif + diff --git a/tools/testing/selftests/riscv/misaligned/misaligned.c b/tools/testing/selftests/riscv/misaligned/misaligned.c new file mode 100644 index 000000000000..50cd8f5ece94 --- /dev/null +++ b/tools/testing/selftests/riscv/misaligned/misaligned.c @@ -0,0 
+1,253 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 Rivos Inc. + * + * Authors: + * Clément Léger <cleger(a)rivosinc.com> + */ +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <linux/ptrace.h> +#include "../../kselftest_harness.h" + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <float.h> +#include <errno.h> +#include <math.h> +#include <string.h> +#include <signal.h> +#include <stdbool.h> +#include <unistd.h> +#include <inttypes.h> +#include <ucontext.h> + +#include <sys/prctl.h> + +#define stringify(s) __stringify(s) +#define __stringify(s) #s + +#define VAL16 0x1234U +#define VAL32 0x5EADBEEFUL +#define VAL64 0x45674321D00DF789ULL + +#define VAL_float 78951.234375 +#define VAL_double 567890.512396965789589290 + +static bool float_equal(float a, float b) +{ + float scaled_epsilon; + float difference = fabsf(a - b); + + // Scale to the largest value. + a = fabsf(a); + b = fabsf(b); + if (a > b) + scaled_epsilon = FLT_EPSILON * a; + else + scaled_epsilon = FLT_EPSILON * b; + + return difference <= scaled_epsilon; +} + +static bool double_equal(double a, double b) +{ + double scaled_epsilon; + double difference = fabsl(a - b); + + // Scale to the largest value. 
+ a = fabs(a); + b = fabs(b); + if (a > b) + scaled_epsilon = DBL_EPSILON * a; + else + scaled_epsilon = DBL_EPSILON * b; + + return difference <= scaled_epsilon; +} + +#define fpu_load_proto(__inst, __type) \ +extern __type test_ ## __inst(unsigned long fp_reg, void *addr, unsigned long offset, __type value) + +fpu_load_proto(flw, float); +fpu_load_proto(fld, double); +fpu_load_proto(c_flw, float); +fpu_load_proto(c_fld, double); +fpu_load_proto(c_fldsp, double); + +#define fpu_store_proto(__inst, __type) \ +extern void test_ ## __inst(unsigned long fp_reg, void *addr, unsigned long offset, __type value) + +fpu_store_proto(fsw, float); +fpu_store_proto(fsd, double); +fpu_store_proto(c_fsw, float); +fpu_store_proto(c_fsd, double); +fpu_store_proto(c_fsdsp, double); + +#define gp_load_proto(__inst, __type) \ +extern __type test_ ## __inst(void *addr, unsigned long offset, __type value) + +gp_load_proto(lh, uint16_t); +gp_load_proto(lhu, uint16_t); +gp_load_proto(lw, uint32_t); +gp_load_proto(lwu, uint32_t); +gp_load_proto(ld, uint64_t); +gp_load_proto(c_lw, uint32_t); +gp_load_proto(c_ld, uint64_t); +gp_load_proto(c_ldsp, uint64_t); + +#define gp_store_proto(__inst, __type) \ +extern void test_ ## __inst(void *addr, unsigned long offset, __type value) + +gp_store_proto(sh, uint16_t); +gp_store_proto(sw, uint32_t); +gp_store_proto(sd, uint64_t); +gp_store_proto(c_sw, uint32_t); +gp_store_proto(c_sd, uint64_t); +gp_store_proto(c_sdsp, uint64_t); + +#define TEST_GP_LOAD(__inst, __type_size) \ +TEST(gp_load_ ## __inst) \ +{ \ + int offset, ret; \ + uint8_t buf[16] __attribute__((aligned(16))); \ + \ + ret = prctl(PR_SET_UNALIGN, PR_UNALIGN_NOPRINT); \ + ASSERT_EQ(ret, 0); \ + \ + for (offset = 1; offset < (__type_size) / 8; offset++) { \ + uint ## __type_size ## _t val = VAL ## __type_size; \ + uint ## __type_size ## _t *ptr = (uint ## __type_size ## _t *)(buf + offset); \ + memcpy(ptr, &val, sizeof(val)); \ + val = test_ ## __inst(ptr, offset, val); \ + EXPECT_EQ(VAL 
## __type_size, val); \ + } \ +} + +TEST_GP_LOAD(lh, 16); +TEST_GP_LOAD(lhu, 16); +TEST_GP_LOAD(lw, 32); +TEST_GP_LOAD(lwu, 32); +TEST_GP_LOAD(ld, 64); +#ifdef __riscv_compressed +TEST_GP_LOAD(c_lw, 32); +TEST_GP_LOAD(c_ld, 64); +TEST_GP_LOAD(c_ldsp, 64); +#endif + +#define TEST_GP_STORE(__inst, __type_size) \ +TEST(gp_load_ ## __inst) \ +{ \ + int offset, ret; \ + uint8_t buf[16] __attribute__((aligned(16))); \ + \ + ret = prctl(PR_SET_UNALIGN, PR_UNALIGN_NOPRINT); \ + ASSERT_EQ(ret, 0); \ + \ + for (offset = 1; offset < (__type_size) / 8; offset++) { \ + uint ## __type_size ## _t val = VAL ## __type_size; \ + uint ## __type_size ## _t *ptr = (uint ## __type_size ## _t *)(buf + offset); \ + memset(ptr, 0, sizeof(val)); \ + test_ ## __inst(ptr, offset, val); \ + memcpy(&val, ptr, sizeof(val)); \ + EXPECT_EQ(VAL ## __type_size, val); \ + } \ +} +TEST_GP_STORE(sh, 16); +TEST_GP_STORE(sw, 32); +TEST_GP_STORE(sd, 64); +#ifdef __riscv_compressed +TEST_GP_STORE(c_sw, 32); +TEST_GP_STORE(c_sd, 64); +TEST_GP_STORE(c_sdsp, 64); +#endif + +#define __TEST_FPU_LOAD(__type, __inst, __reg_start, __reg_end) \ +TEST(fpu_load_ ## __inst) \ +{ \ + int ret, offset, fp_reg; \ + uint8_t buf[16] __attribute__((aligned(16))); \ + \ + ret = prctl(PR_SET_UNALIGN, PR_UNALIGN_NOPRINT); \ + ASSERT_EQ(ret, 0); \ + \ + for (fp_reg = __reg_start; fp_reg < __reg_end; fp_reg++) { \ + for (offset = 1; offset < 4; offset++) { \ + void *load_addr = (buf + offset); \ + __type val = VAL_ ## __type ; \ + \ + memcpy(load_addr, &val, sizeof(val)); \ + val = test_ ## __inst(fp_reg, load_addr, offset, val); \ + EXPECT_TRUE(__type ##_equal(val, VAL_## __type)); \ + } \ + } \ +} +#define TEST_FPU_LOAD(__type, __inst) \ + __TEST_FPU_LOAD(__type, __inst, 0, 32) +#define TEST_FPU_LOAD_COMPRESSED(__type, __inst) \ + __TEST_FPU_LOAD(__type, __inst, 8, 16) + +TEST_FPU_LOAD(float, flw) +TEST_FPU_LOAD(double, fld) +#ifdef __riscv_compressed +TEST_FPU_LOAD_COMPRESSED(double, c_fld) +TEST_FPU_LOAD_COMPRESSED(double, 
c_fldsp) +#endif + +#define __TEST_FPU_STORE(__type, __inst, __reg_start, __reg_end) \ +TEST(fpu_store_ ## __inst) \ +{ \ + int ret, offset, fp_reg; \ + uint8_t buf[16] __attribute__((aligned(16))); \ + \ + ret = prctl(PR_SET_UNALIGN, PR_UNALIGN_NOPRINT); \ + ASSERT_EQ(ret, 0); \ + \ + for (fp_reg = __reg_start; fp_reg < __reg_end; fp_reg++) { \ + for (offset = 1; offset < 4; offset++) { \ + \ + void *store_addr = (buf + offset); \ + __type val = VAL_ ## __type ; \ + \ + test_ ## __inst(fp_reg, store_addr, offset, val); \ + memcpy(&val, store_addr, sizeof(val)); \ + EXPECT_TRUE(__type ## _equal(val, VAL_## __type)); \ + } \ + } \ +} +#define TEST_FPU_STORE(__type, __inst) \ + __TEST_FPU_STORE(__type, __inst, 0, 32) +#define TEST_FPU_STORE_COMPRESSED(__type, __inst) \ + __TEST_FPU_STORE(__type, __inst, 8, 16) + +TEST_FPU_STORE(float, fsw) +TEST_FPU_STORE(double, fsd) +#ifdef __riscv_compressed +TEST_FPU_STORE_COMPRESSED(double, c_fsd) +TEST_FPU_STORE_COMPRESSED(double, c_fsdsp) +#endif + +TEST_SIGNAL(gen_sigbus, SIGBUS) +{ + uint32_t val = VAL32; + uint8_t buf[16] __attribute__((aligned(16))); + int ret; + + ret = prctl(PR_SET_UNALIGN, PR_UNALIGN_SIGBUS); + ASSERT_EQ(ret, 0); + + asm volatile("sw %0, 1(%1)" : : "r"(val), "r"(buf) : "memory"); +} + +int main(int argc, char **argv) +{ + int ret, val; + + ret = prctl(PR_GET_UNALIGN, &val); + if (ret == -1 && errno == EINVAL) + ksft_exit_skip("SKIP GET_UNALIGN_CTL not supported\n"); + + exit(test_harness_run(argc, argv)); +} -- 2.49.0

3 months, 3 weeks

2
4
0 0

[PATCH] selftests/nolibc: avoid GCC 15 -Wunterminated-string-initialization

by Thomas Weißschuh

On GCC 15 the following warning is emitted: nolibc-test.c: In function ‘run_stdlib’: nolibc-test.c:1416:32: warning: initializer-string for array of ‘char’ truncates NUL terminator but destination lacks ‘nonstring’ attribute (11 chars into 10 available) [-Wunterminated-string-initialization] 1416 | char buf[10] = "test123456"; | ^~~~~~~~~~~~ Increase the size of buf to avoid the warning. It would also be possible to use __attribute__((nonstring)) but that would require some ifdeffery to work with older compilers. Fixes: 1063649cf531 ("selftests/nolibc: Add tests for strlcat() and strlcpy()") Signed-off-by: Thomas Weißschuh <linux(a)weissschuh.net> --- tools/testing/selftests/nolibc/nolibc-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index dbe13000fb1ac153e9a89f627492daeb584a05d4..52640d8ae402b9e34174ae798e74882ca750ec2b 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -1413,7 +1413,7 @@ int run_stdlib(int min, int max) * Add some more chars after the \0, to test functions that overwrite the buffer set * the \0 at the exact right position. */ - char buf[10] = "test123456"; + char buf[11] = "test123456"; buf[4] = '\0'; --- base-commit: eb135311083100b6590a7545618cd9760d896a86 change-id: 20250623-nolibc-nonstring-7fe6974552b5 Best regards, -- Thomas Weißschuh <linux(a)weissschuh.net>

3 months, 3 weeks

3
2
0 0

[PATCH] selftests/kexec: ignore selftest binary

by Dylan Yudaken

Add a .gitignore for the test case build object. Signed-off-by: Dylan Yudaken <dyudaken(a)gmail.com> --- Hi, I noticed this was causing some noise in my git checkout, but perhaps I was doing something odd that it has not been noticed before? Regards, Dylan tools/testing/selftests/kexec/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 tools/testing/selftests/kexec/.gitignore diff --git a/tools/testing/selftests/kexec/.gitignore b/tools/testing/selftests/kexec/.gitignore new file mode 100644 index 000000000000..5f3d9e089ae8 --- /dev/null +++ b/tools/testing/selftests/kexec/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +test_kexec_jump base-commit: 86731a2a651e58953fc949573895f2fa6d456841 -- 2.49.0

3 months, 3 weeks

2
1
0 0

[kvm-unit-tests PATCH] riscv: lib: sbi_shutdown add exit code.

by Jesse Taube

When exiting, it may be useful for the SBI implementation to know the exit code. Add exit code to sbi_shutdown, and use it in exit(). Signed-off-by: Jesse Taube <jesse(a)rivosinc.com> --- lib/riscv/asm/sbi.h | 2 +- lib/riscv/io.c | 2 +- lib/riscv/sbi.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/riscv/asm/sbi.h b/lib/riscv/asm/sbi.h index a5738a5c..de11c109 100644 --- a/lib/riscv/asm/sbi.h +++ b/lib/riscv/asm/sbi.h @@ -250,7 +250,7 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, unsigned long arg3, unsigned long arg4, unsigned long arg5); -void sbi_shutdown(void); +void sbi_shutdown(unsigned int code); struct sbiret sbi_hart_start(unsigned long hartid, unsigned long entry, unsigned long sp); struct sbiret sbi_hart_stop(void); struct sbiret sbi_hart_get_status(unsigned long hartid); diff --git a/lib/riscv/io.c b/lib/riscv/io.c index fb40adb7..02231268 100644 --- a/lib/riscv/io.c +++ b/lib/riscv/io.c @@ -150,7 +150,7 @@ void halt(int code); void exit(int code) { printf("\nEXIT: STATUS=%d\n", ((code) << 1) | 1); - sbi_shutdown(); + sbi_shutdown(code & 1); halt(code); __builtin_unreachable(); } diff --git a/lib/riscv/sbi.c b/lib/riscv/sbi.c index 2959378f..9dd11e9d 100644 --- a/lib/riscv/sbi.c +++ b/lib/riscv/sbi.c @@ -107,9 +107,9 @@ struct sbiret sbi_sse_inject(unsigned long event_id, unsigned long hart_id) return sbi_ecall(SBI_EXT_SSE, SBI_EXT_SSE_INJECT, event_id, hart_id, 0, 0, 0, 0); } -void sbi_shutdown(void) +void sbi_shutdown(unsigned int code) { - sbi_ecall(SBI_EXT_SRST, 0, 0, 0, 0, 0, 0, 0); + sbi_ecall(SBI_EXT_SRST, 0, 0, code, 0, 0, 0, 0); puts("SBI shutdown failed!\n"); } -- 2.43.0

3 months, 3 weeks

2
3
0 0

[PATCH] ksm_tests: Skip hugepage test when Transparent Hugepages are disabled

by Li Wang

Some systems (e.g. minimal or real-time kernels) may not enable Transparent Hugepages (THP), causing MADV_HUGEPAGE to return EINVAL. This patch introduces a runtime check using the existing THP sysfs interface and skips the hugepage merging test (`-H`) when THP is not available. To avoid those failures: # ----------------------------- # running ./ksm_tests -H -s 100 # ----------------------------- # ksm_tests: MADV_HUGEPAGE: Invalid argument # [FAIL] not ok 1 ksm_tests -H -s 100 # exit=2 # -------------------- # running ./khugepaged # -------------------- # Reading PMD pagesize failed# [FAIL] not ok 1 khugepaged # exit=1 # -------------------- # running ./soft-dirty # -------------------- # TAP version 13 # 1..15 # ok 1 Test test_simple # ok 2 Test test_vma_reuse dirty bit of allocated page # ok 3 Test test_vma_reuse dirty bit of reused address page # Bail out! Reading PMD pagesize failed# Planned tests != run tests (15 != 3) # # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 # [FAIL] not ok 1 soft-dirty # exit=1 # SUMMARY: PASS=0 SKIP=0 FAIL=1 # ------------------- # running ./migration # ------------------- # TAP version 13 # 1..3 # # Starting 3 tests from 1 test cases. # # RUN migration.private_anon ... # # OK migration.private_anon # ok 1 migration.private_anon # # RUN migration.shared_anon ... # # OK migration.shared_anon # ok 2 migration.shared_anon # # RUN migration.private_anon_thp ... # # migration.c:196:private_anon_thp:Expected madvise(ptr, TWOMEG, MADV_HUGEPAGE) (-1) == 0 (0) # # private_anon_thp: Test terminated by assertion # # FAIL migration.private_anon_thp # not ok 3 migration.private_anon_thp # # FAILED: 2 / 3 tests passed. 
# # Totals: pass:2 fail:1 xfail:0 xpass:0 skip:0 error:0 # [FAIL] not ok 1 migration # exit=1 Signed-off-by: Li Wang <liwang(a)redhat.com> Cc: Aruna Ramakrishna <aruna.ramakrishna(a)oracle.com> Cc: Bagas Sanjaya <bagasdotme(a)gmail.com> Cc: Catalin Marinas <catalin.marinas(a)arm.com> Cc: Dave Hansen <dave.hansen(a)linux.intel.com> Cc: David Hildenbrand <david(a)redhat.com> Cc: Joey Gouly <joey.gouly(a)arm.com> Cc: Johannes Weiner <hannes(a)cmpxchg.org> Cc: Keith Lucas <keith.lucas(a)oracle.com> Cc: Ryan Roberts <ryan.roberts(a)arm.com> Cc: Shuah Khan <shuah(a)kernel.org> --- tools/testing/selftests/mm/khugepaged.c | 5 +++++ tools/testing/selftests/mm/ksm_tests.c | 6 ++++++ tools/testing/selftests/mm/migration.c | 8 ++++++++ tools/testing/selftests/mm/soft-dirty.c | 9 ++++++++- tools/testing/selftests/mm/thp_settings.c | 11 +++++++++++ tools/testing/selftests/mm/thp_settings.h | 2 ++ 6 files changed, 40 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 8a4d34cce36b..6822eb7ea58e 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -1190,6 +1190,11 @@ int main(int argc, char **argv) .read_ahead_kb = 0, }; + if (!thp_is_enabled()) { + printf("Transparent Hugepages not available\n"); + return KSFT_SKIP; + } + parse_test_type(argc, argv); setbuf(stdout, NULL); diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c index dcdd5bb20f3d..1fb2263faa8d 100644 --- a/tools/testing/selftests/mm/ksm_tests.c +++ b/tools/testing/selftests/mm/ksm_tests.c @@ -15,6 +15,7 @@ #include "../kselftest.h" #include <include/vdso/time64.h> #include "vm_util.h" +#include "thp_settings.h" #define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" #define KSM_FP(s) (KSM_SYSFS_PATH s) @@ -555,6 +556,11 @@ static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot, unsigned long scan_time_ns; int pagemap_fd, n_normal_pages, 
n_huge_pages; + if (!thp_is_enabled()) { + printf("Transparent Hugepages not available\n"); + return KSFT_SKIP; + } + map_size *= MB; size_t len = map_size; diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index 1e3a595fbf01..a306f8bab087 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -5,6 +5,8 @@ */ #include "../kselftest_harness.h" +#include "thp_settings.h" + #include <strings.h> #include <pthread.h> #include <numa.h> @@ -185,6 +187,9 @@ TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME) uint64_t *ptr; int i; + if (!thp_is_enabled()) + SKIP(return, "Transparent Hugepages not available"); + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) SKIP(return, "Not enough threads or NUMA nodes available"); @@ -214,6 +219,9 @@ TEST_F_TIMEOUT(migration, shared_anon_thp, 2*RUNTIME) uint64_t *ptr; int i; + if (!thp_is_enabled()) + SKIP(return, "Transparent Hugepages not available"); + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) SKIP(return, "Not enough threads or NUMA nodes available"); diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index 8e1462ce0532..72d8ded87756 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -6,8 +6,10 @@ #include <stdint.h> #include <malloc.h> #include <sys/mman.h> + #include "../kselftest.h" #include "vm_util.h" +#include "thp_settings.h" #define PAGEMAP_FILE_PATH "/proc/self/pagemap" #define TEST_ITERATIONS 10000 @@ -78,8 +80,13 @@ static void test_hugepage(int pagemap_fd, int pagesize) { char *map; int i, ret; - size_t hpage_len = read_pmd_pagesize(); + if (!thp_is_enabled()) { + printf("Skipping test: Transparent Hugepages not available\n"); + return KSFT_SKIP; + } + + size_t hpage_len = read_pmd_pagesize(); if (!hpage_len) ksft_exit_fail_msg("Reading PMD pagesize failed"); diff --git a/tools/testing/selftests/mm/thp_settings.c 
b/tools/testing/selftests/mm/thp_settings.c index ad872af1c81a..bad60ac52874 100644 --- a/tools/testing/selftests/mm/thp_settings.c +++ b/tools/testing/selftests/mm/thp_settings.c @@ -381,3 +381,14 @@ unsigned long thp_shmem_supported_orders(void) { return __thp_supported_orders(true); } + +bool thp_is_enabled(void) +{ + if (access(THP_SYSFS, F_OK) != 0) + return false; + + int mode = thp_read_string("enabled", thp_enabled_strings); + + /* THP is considered enabled if it's either "always" or "madvise" */ + return mode == 1 || mode == 3; +} diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h index fc131d23d593..6c07f70beee9 100644 --- a/tools/testing/selftests/mm/thp_settings.h +++ b/tools/testing/selftests/mm/thp_settings.h @@ -84,4 +84,6 @@ void thp_set_read_ahead_path(char *path); unsigned long thp_supported_orders(void); unsigned long thp_shmem_supported_orders(void); +bool thp_is_enabled(void); + #endif /* __THP_SETTINGS_H__ */ -- 2.49.0

3 months, 3 weeks

3
3
0 0

[PATCH v8 net-next 00/15] AccECN protocol patch series

by chia-yu.chang＠nokia-bell-labs.com

From: Chia-Yu Chang <chia-yu.chang(a)nokia-bell-labs.com> Hello, Please find the v8 AccECN protocol patch series, which covers the core functionality of Accurate ECN, AccECN negotiation, AccECN TCP options, and AccECN failure handling. The Accurate ECN draft can be found in https://datatracker.ietf.org/doc/html/draft-ietf-tcpm-accurate-ecn-28 This patch series is part of the full AccECN patch series, which is available at https://github.com/L4STeam/linux-net-next/commits/upstream_l4steam/ v8 (10-Jun-2025) - Add new helper function tcp_ecn_received_counters_payload() in #6 (Paolo Abeni <pabeni(a)redhat.com>) - Set opts->num_sack_blocks=0 to avoid potential undefined value in #8 (Paolo Abeni <pabeni(a)redhat.com>) - Reset leftover_size to 2 once leftover_bytes is used in #9 (Paolo Abeni <pabeni(a)redhat.com>) - Add new helper function tcp_accecn_opt_demand_min() in #10 (Paolo Abeni <pabeni(a)redhat.com>) - Add new helper function tcp_accecn_saw_opt_fail_recv() in #11 (Paolo Abeni <pabeni(a)redhat.com>) - Update tcp_options_fit_accecn() to avoid using recursion in #14 (Paolo Abeni <pabeni(a)redhat.com>) v7 (14-May-2025) - Modify group sizes of tcp_sock_write_txrx and tcp_sock_write_rx in #3 based on pahole results (Paolo Abeni <pabeni(a)redhat.com>) - Fix the issue in #4 and #5 where the RFC3168 ECN behavior in tcp_ecn_send() is changed (Paolo Abeni <pabeni(a)redhat.com>) - Modify group size of tcp_sock_write_txrx in #4 and #6 based on pahole results (Paolo Abeni <pabeni(a)redhat.com>) - Update commit message for #9 to explain the increase in tcp_sock_write_rx group size - Modify group size of tcp_sock_write_tx in #10 based on pahole results v6 (09-May-2025) - Add #3 to utilize existing holes of tcp_sock_write_txrx group for later patches (#4, #9, #10) with new u8 members (Paolo Abeni <pabeni(a)redhat.com>) - Add pahole outcomes before and after commit in #4, #5, #6, #9, #10, #15 (Paolo Abeni <pabeni(a)redhat.com>) - Define new helper function 
tcp_send_ack_reflect_ect() for sending ACK with reflected ECT in #5 (Paolo Abeni <pabeni(a)redhat.com>) - Add comments for function tcp_ecn_rcv_synack() in #5 (Paolo Abeni <pabeni(a)redhat.com>) - Add enum/define to be used by sysctl_tcp_ecn in #5, sysctl_tcp_ecn_option in #9, and sysctl_tcp_ecn_option_beacon in #10 (Paolo Abeni <pabeni(a)redhat.com>) - Move accecn_fail_mode and saw_accecn_opt in #5 and #11 to use existing holes of tcp_sock (Paolo Abeni <pabeni(a)redhat.com>) - Change data type of new members of tcp_request_sock and move them to the end of struct in #5 and #11 (Paolo Abeni <pabeni(a)redhat.com>) - Move new members of tcp_info to the end of struct in #6 (Paolo Abeni <pabeni(a)redhat.com>) - Merge previous #7 into #9 (Paolo Abeni <pabeni(a)redhat.com>) - Mask ecnfield with INET_ECN_MASK to remove WARN_ONCE in #9 (Paolo Abeni <pabeni(a)redhat.com>) - Reduce the indentation levels for readability in #9 and #10 (Paolo Abeni <pabeni(a)redhat.com>) - Move delivered_ecn_bytes to the RX group in #9, accecn_opt_tstamp to the TX group in #10, pkts_acked_ewma to the RX group in #15 (Paolo Abeni <pabeni(a)redhat.com>) - Add changes in Documentation/networking/net_cachelines/tcp_sock.rst for new tcp_sock members in #3, #5, #6, #9, #10, #15 v5 (22-Apr-2025) - Further fix for 32-bit ARM alignment in tcp.c (Simon Horman <horms(a)kernel.org>) v4 (18-Apr-2025) - Fix 32-bit ARM assertion for alignment requirement (Simon Horman <horms(a)kernel.org>) v3 (14-Apr-2025) - Fix patch apply issue in v2 (Jakub Kicinski <kuba(a)kernel.org>) v2 (18-Mar-2025) - Add one missing patch from the previous AccECN protocol preparation patch series to this patch series. 
Best regards, Chia-Yu Chia-Yu Chang (3): tcp: reorganize tcp_sock_write_txrx group for variables later tcp: accecn: AccECN option failure handling tcp: accecn: try to fit AccECN option with SACK Ilpo Järvinen (12): tcp: reorganize SYN ECN code tcp: fast path functions later tcp: AccECN core tcp: accecn: AccECN negotiation tcp: accecn: add AccECN rx byte counters tcp: accecn: AccECN needs to know delivered bytes tcp: sack option handling improvements tcp: accecn: AccECN option tcp: accecn: AccECN option send control tcp: accecn: AccECN option ceb/cep heuristic tcp: accecn: AccECN ACE field multi-wrap heuristic tcp: try to avoid safer when ACKs are thinned .../networking/net_cachelines/tcp_sock.rst | 14 + include/linux/tcp.h | 34 +- include/net/netns/ipv4.h | 2 + include/net/tcp.h | 225 ++++++- include/uapi/linux/tcp.h | 7 + net/ipv4/syncookies.c | 3 + net/ipv4/sysctl_net_ipv4.c | 19 + net/ipv4/tcp.c | 30 +- net/ipv4/tcp_input.c | 611 +++++++++++++++++- net/ipv4/tcp_ipv4.c | 7 +- net/ipv4/tcp_minisocks.c | 91 ++- net/ipv4/tcp_output.c | 303 ++++++++- net/ipv6/syncookies.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 14 files changed, 1250 insertions(+), 98 deletions(-) -- 2.34.1

3 months, 3 weeks

4
22
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-kselftest-mirror