July 2025 - Linux-kselftest-mirror

[PATCH net-next] devlink: Fix excessive stack usage in rate TC bandwidth parsing

by Tariq Toukan

From: Carolina Jubran <cjubran(a)nvidia.com> The devlink_nl_rate_tc_bw_parse function uses a large stack array for devlink attributes, which triggers a warning about excessive stack usage: net/devlink/rate.c: In function 'devlink_nl_rate_tc_bw_parse': net/devlink/rate.c:382:1: error: the frame size of 1648 bytes is larger than 1536 bytes [-Werror=frame-larger-than=] Introduce a separate attribute set specifically for rate TC bandwidth parsing that only contains the two attributes actually used: index and bandwidth. This reduces the stack array from DEVLINK_ATTR_MAX entries to just 2 entries, solving the stack usage issue. Update devlink selftest to use the new 'index' and 'bw' attribute names consistent with the YAML spec. Example usage with ynl with the new spec: ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-set --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1, "rate-tc-bws": [ {"index": 0, "bw": 50}, {"index": 1, "bw": 50}, {"index": 2, "bw": 0}, {"index": 3, "bw": 0}, {"index": 4, "bw": 0}, {"index": 5, "bw": 0}, {"index": 6, "bw": 0}, {"index": 7, "bw": 0} ] }' ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-get --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1 }' output for rate-get: {'bus-name': 'pci', 'dev-name': '0000:08:00.0', 'port-index': 1, 'rate-tc-bws': [{'bw': 50, 'index': 0}, {'bw': 50, 'index': 1}, {'bw': 0, 'index': 2}, {'bw': 0, 'index': 3}, {'bw': 0, 'index': 4}, {'bw': 0, 'index': 5}, {'bw': 0, 'index': 6}, {'bw': 0, 'index': 7}], 'rate-tx-max': 0, 'rate-tx-priority': 0, 'rate-tx-share': 0, 'rate-tx-weight': 0, 'rate-type': 'leaf'} Fixes: 566e8f108fc7 ("devlink: Extend devlink rate API with traffic classes bandwidth management") Reported-by: Arnd Bergmann <arnd(a)arndb.de> Closes: https://lore.kernel.org/netdev/20250708160652.1810573-1-arnd@kernel.org/ Reported-by: kernel test robot <lkp(a)intel.com> Closes: https://lore.kernel.org/oe-kbuild-all/202507171943.W7DJcs6Y-lkp@intel.com/ Suggested-by: Jakub Kicinski <kuba(a)kernel.org> Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> Signed-off-by: Carolina Jubran <cjubran(a)nvidia.com> Tested-by: Carolina Jubran <cjubran(a)nvidia.com> Signed-off-by: Tariq Toukan <tariqt(a)nvidia.com> --- Documentation/netlink/specs/devlink.yaml | 26 ++++++++----------- include/uapi/linux/devlink.h | 11 ++++++-- net/devlink/netlink_gen.c | 6 ++--- net/devlink/netlink_gen.h | 2 +- net/devlink/rate.c | 20 +++++++------- .../drivers/net/hw/devlink_rate_tc_bw.py | 16 ++++++------ 6 files changed, 42 insertions(+), 39 deletions(-) diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index 1c4bb0cbe5f0..bb87111d5e16 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -853,18 +853,6 @@ attribute-sets: type: nest multi-attr: true nested-attributes: dl-rate-tc-bws - - - name: rate-tc-index - type: u8 - checks: - max: rate-tc-index-max - - - name: rate-tc-bw - type: u32 - doc: | - Specifies the bandwidth share assigned to the Traffic Class. - The bandwidth for the traffic class is determined - in proportion to the sum of the shares of all configured classes. - name: dl-dev-stats subset-of: devlink @@ -1271,12 +1259,20 @@ attribute-sets: type: flag - name: dl-rate-tc-bws - subset-of: devlink + name-prefix: devlink-rate-tc-attr- attributes: - - name: rate-tc-index + name: index + type: u8 + checks: + max: rate-tc-index-max - - name: rate-tc-bw + name: bw + type: u32 + doc: | + Specifies the bandwidth share assigned to the Traffic Class. + The bandwidth for the traffic class is determined + in proportion to the sum of the shares of all configured classes. operations: enum-model: directional diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index e72bcc239afd..9fcb25a0f447 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -635,8 +635,6 @@ enum devlink_attr { DEVLINK_ATTR_REGION_DIRECT, /* flag */ DEVLINK_ATTR_RATE_TC_BWS, /* nested */ - DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ - DEVLINK_ATTR_RATE_TC_BW, /* u32 */ /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate @@ -647,6 +645,15 @@ enum devlink_attr { DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1 }; +enum devlink_rate_tc_attr { + DEVLINK_RATE_TC_ATTR_UNSPEC, + DEVLINK_RATE_TC_ATTR_INDEX, /* u8 */ + DEVLINK_RATE_TC_ATTR_BW, /* u32 */ + + __DEVLINK_RATE_TC_ATTR_MAX, + DEVLINK_RATE_TC_ATTR_MAX = __DEVLINK_RATE_TC_ATTR_MAX - 1 +}; + /* Mapping between internal resource described by the field and system * structure */ diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index c50436433c18..d97c326a9045 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -45,9 +45,9 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_ [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15), }; -const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { - [DEVLINK_ATTR_RATE_TC_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX), - [DEVLINK_ATTR_RATE_TC_BW] = { .type = NLA_U32, }, +const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1] = { + [DEVLINK_RATE_TC_ATTR_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX), + [DEVLINK_RATE_TC_ATTR_BW] = { .type = NLA_U32, }, }; const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = { diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index fb733b5d4ff1..09cc6f264ccf 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -13,7 +13,7 @@ /* Common nested types */ extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1]; -extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1]; +extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1]; extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1]; /* Ops table for devlink */ diff --git a/net/devlink/rate.c b/net/devlink/rate.c index d39300a9b3d4..110b3fa8a0b1 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -90,8 +90,8 @@ static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw) if (!nla_tc_bw) return -EMSGSIZE; - if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) || - nla_put_u32(msg, DEVLINK_ATTR_RATE_TC_BW, tc_bw[i])) + if (nla_put_u8(msg, DEVLINK_RATE_TC_ATTR_INDEX, i) || + nla_put_u32(msg, DEVLINK_RATE_TC_ATTR_BW, tc_bw[i])) goto nla_put_failure; nla_nest_end(msg, nla_tc_bw); @@ -346,26 +346,26 @@ static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, unsigned long *bitmap, struct netlink_ext_ack *extack) { - struct nlattr *tb[DEVLINK_ATTR_MAX + 1]; + struct nlattr *tb[DEVLINK_RATE_TC_ATTR_MAX + 1]; u8 tc_index; int err; - err = nla_parse_nested(tb, DEVLINK_ATTR_MAX, parent_nest, + err = nla_parse_nested(tb, DEVLINK_RATE_TC_ATTR_MAX, parent_nest, devlink_dl_rate_tc_bws_nl_policy, extack); if (err) return err; - if (!tb[DEVLINK_ATTR_RATE_TC_INDEX]) { + if (!tb[DEVLINK_RATE_TC_ATTR_INDEX]) { NL_SET_ERR_ATTR_MISS(extack, parent_nest, - DEVLINK_ATTR_RATE_TC_INDEX); + DEVLINK_RATE_TC_ATTR_INDEX); return -EINVAL; } - tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]); + tc_index = nla_get_u8(tb[DEVLINK_RATE_TC_ATTR_INDEX]); - if (!tb[DEVLINK_ATTR_RATE_TC_BW]) { + if (!tb[DEVLINK_RATE_TC_ATTR_BW]) { NL_SET_ERR_ATTR_MISS(extack, parent_nest, - DEVLINK_ATTR_RATE_TC_BW); + DEVLINK_RATE_TC_ATTR_BW); return -EINVAL; } @@ -376,7 +376,7 @@ static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, return -EINVAL; } - tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_TC_BW]); + tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_RATE_TC_ATTR_BW]); return 0; } diff --git a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py index 820d8a03becc..835c357919a8 100755 --- a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py +++ b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py @@ -208,14 +208,14 @@ def setup_devlink_rate(cfg): "port-index": port_index, "rate-tx-max": 125000000, "rate-tc-bws": [ - {"rate-tc-index": 0, "rate-tc-bw": 0}, - {"rate-tc-index": 1, "rate-tc-bw": 0}, - {"rate-tc-index": 2, "rate-tc-bw": 0}, - {"rate-tc-index": 3, "rate-tc-bw": 20}, - {"rate-tc-index": 4, "rate-tc-bw": 80}, - {"rate-tc-index": 5, "rate-tc-bw": 0}, - {"rate-tc-index": 6, "rate-tc-bw": 0}, - {"rate-tc-index": 7, "rate-tc-bw": 0}, + {"index": 0, "bw": 0}, + {"index": 1, "bw": 0}, + {"index": 2, "bw": 0}, + {"index": 3, "bw": 20}, + {"index": 4, "bw": 80}, + {"index": 5, "bw": 0}, + {"index": 6, "bw": 0}, + {"index": 7, "bw": 0}, ] }) except NlError as exc: base-commit: 3fc894728fb3a0d9282e81247b68c07468fe2985 -- 2.31.1

5 months, 1 week

4
3
0 0

[PATCH net v2] selftests: drv-net: wait for iperf client to stop sending

by Nimrod Oren

A few packets may still be sent out during the termination of iperf processes. These late packets cause failures in rss_ctx.py when they arrive on queues expected to be empty. Example failure observed: Check failed 2 != 0 traffic on inactive queues (context 1): [0, 0, 1, 1, 386385, 397196, 0, 0, 0, 0, ...] Check failed 4 != 0 traffic on inactive queues (context 2): [0, 0, 0, 0, 2, 2, 247152, 253013, 0, 0, ...] Check failed 2 != 0 traffic on inactive queues (context 3): [0, 0, 0, 0, 0, 0, 1, 1, 282434, 283070, ...] To avoid such failures, wait until all client sockets for the requested port are either closed or in the TIME_WAIT state. Fixes: 847aa551fa78 ("selftests: drv-net: rss_ctx: factor out send traffic and check") Signed-off-by: Nimrod Oren <noren(a)nvidia.com> Reviewed-by: Gal Pressman <gal(a)nvidia.com> Reviewed-by: Carolina Jubran <cjubran(a)nvidia.com> --- Changelog: v2: - Replace fixed sleep with logic that waits for client sockets to close. - Update commit title and message to reflect new approach. v1: https://lore.kernel.org/all/20250629111812.644282-1-noren@nvidia.com/ --- .../selftests/drivers/net/lib/py/load.py | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py index d9c10613ae67..44151b7b1a24 100644 --- a/tools/testing/selftests/drivers/net/lib/py/load.py +++ b/tools/testing/selftests/drivers/net/lib/py/load.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +import re import time from lib.py import ksft_pr, cmd, ip, rand_port, wait_port_listen @@ -10,12 +11,11 @@ class GenerateTraffic: self.env = env - if port is None: - port = rand_port() - self._iperf_server = cmd(f"iperf3 -s -1 -p {port}", background=True) - wait_port_listen(port) + self.port = rand_port() if port is None else port + self._iperf_server = cmd(f"iperf3 -s -1 -p {self.port}", background=True) + wait_port_listen(self.port) time.sleep(0.1) - self._iperf_client = cmd(f"iperf3 -c {env.addr} -P 16 -p {port} -t 86400", + self._iperf_client = cmd(f"iperf3 -c {env.addr} -P 16 -p {self.port} -t 86400", background=True, host=env.remote) # Wait for traffic to ramp up @@ -56,3 +56,16 @@ class GenerateTraffic: ksft_pr(">> Server:") ksft_pr(self._iperf_server.stdout) ksft_pr(self._iperf_server.stderr) + self._wait_client_stopped() + + def _wait_client_stopped(self, sleep=0.005, timeout=5): + end = time.monotonic() + timeout + + live_port_pattern = re.compile(fr":{self.port:04X} 0[^6] ") + + while time.monotonic() < end: + data = cmd("cat /proc/net/tcp*", host=self.env.remote).stdout + if not live_port_pattern.search(data): + return + time.sleep(sleep) + raise Exception(f"Waiting for client to stop timed out after {timeout}s") -- 2.40.1

5 months, 1 week

3
2
0 0

[PATCH net-next v3 4/4] selftests: drv-net: add test for RSS on flow label

by Jakub Kicinski

Add a simple test for checking that RSS on flow label works, and that its rejected for IPv4 flows. # ./tools/testing/selftests/drivers/net/hw/rss_flow_label.py TAP version 13 1..2 ok 1 rss_flow_label.test_rss_flow_label ok 2 rss_flow_label.test_rss_flow_label_6only # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Reviewed-by: Willem de Bruijn <willemb(a)google.com> Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> --- v2: - check for RPS / RFS v1: https://lore.kernel.org/20250722014915.3365370-5-kuba@kernel.org CC: shuah(a)kernel.org CC: sdf(a)fomichev.me CC: linux-kselftest(a)vger.kernel.org --- .../testing/selftests/drivers/net/hw/Makefile | 1 + .../drivers/net/hw/rss_flow_label.py | 167 ++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/rss_flow_label.py diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index fdc97355588c..5159fd34cb33 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -18,6 +18,7 @@ TEST_PROGS = \ pp_alloc_fail.py \ rss_api.py \ rss_ctx.py \ + rss_flow_label.py \ rss_input_xfrm.py \ tso.py \ xsk_reconfig.py \ diff --git a/tools/testing/selftests/drivers/net/hw/rss_flow_label.py b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py new file mode 100755 index 000000000000..6fa95fe27c47 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Tests for RSS hashing on IPv6 Flow Label. +""" + +import glob +import os +import socket +from lib.py import CmdExitFailure +from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge, ksft_in, \ + ksft_not_in, ksft_raises, KsftSkipEx +from lib.py import bkg, cmd, defer, fd_read_timeout, rand_port +from lib.py import NetDrvEpEnv + + +def _check_system(cfg): + if not hasattr(socket, "SO_INCOMING_CPU"): + raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11") + + qcnt = len(glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*")) + if qcnt < 2: + raise KsftSkipEx(f"Local has only {qcnt} queues") + + for f in [f"/sys/class/net/{cfg.ifname}/queues/rx-0/rps_flow_cnt", + f"/sys/class/net/{cfg.ifname}/queues/rx-0/rps_cpus"]: + try: + with open(f, 'r') as fp: + setting = fp.read().strip() + # CPU mask will be zeros and commas + if setting.replace("0", "").replace(",", ""): + raise KsftSkipEx(f"RPS/RFS is configured: {f}: {setting}") + except FileNotFoundError: + pass + + # 1 is the default, if someone changed it we probably shouldn"t mess with it + af = cmd("cat /proc/sys/net/ipv6/auto_flowlabels", host=cfg.remote).stdout + if af.strip() != "1": + raise KsftSkipEx("Remote does not have auto_flowlabels enabled") + + +def _ethtool_get_cfg(cfg, fl_type): + descr = cmd(f"ethtool -n {cfg.ifname} rx-flow-hash {fl_type}").stdout + + converter = { + "IP SA": "s", + "IP DA": "d", + "L3 proto": "t", + "L4 bytes 0 & 1 [TCP/UDP src port]": "f", + "L4 bytes 2 & 3 [TCP/UDP dst port]": "n", + "IPv6 Flow Label": "l", + } + + ret = "" + for line in descr.split("\n")[1:-2]: + # if this raises we probably need to add more keys to converter above + ret += converter[line] + return ret + + +def _traffic(cfg, one_sock, one_cpu): + local_port = rand_port(socket.SOCK_DGRAM) + remote_port = rand_port(socket.SOCK_DGRAM) + + sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) + sock.bind(("", local_port)) + sock.connect((cfg.remote_addr_v["6"], 0)) + if one_sock: + send = f"exec 5<>/dev/udp/{cfg.addr_v['6']}/{local_port}; " \ + "for i in `seq 20`; do echo a >&5; sleep 0.02; done; exec 5>&-" + else: + send = "for i in `seq 20`; do echo a | socat -t0.02 - UDP6:" \ + f"[{cfg.addr_v['6']}]:{local_port},sourceport={remote_port}; done" + + cpus = set() + with bkg(send, shell=True, host=cfg.remote, exit_wait=True): + for _ in range(20): + fd_read_timeout(sock.fileno(), 1) + cpu = sock.getsockopt(socket.SOL_SOCKET, socket.SO_INCOMING_CPU) + cpus.add(cpu) + + if one_cpu: + ksft_eq(len(cpus), 1, + f"{one_sock=} - expected one CPU, got traffic on: {cpus=}") + else: + ksft_ge(len(cpus), 2, + f"{one_sock=} - expected many CPUs, got traffic on: {cpus=}") + + +def test_rss_flow_label(cfg): + """ + Test hashing on IPv6 flow label. Send traffic over a single socket + and over multiple sockets. Depend on the remote having auto-label + enabled so that it randomizes the label per socket. + """ + + cfg.require_ipver("6") + cfg.require_cmd("socat", remote=True) + _check_system(cfg) + + # Enable flow label hashing for UDP6 + initial = _ethtool_get_cfg(cfg, "udp6") + no_lbl = initial.replace("l", "") + if "l" not in initial: + try: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 l{no_lbl}") + except CmdExitFailure as exc: + raise KsftSkipEx("Device doesn't support Flow Label for UDP6") from exc + + defer(cmd, f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {initial}") + + _traffic(cfg, one_sock=True, one_cpu=True) + _traffic(cfg, one_sock=False, one_cpu=False) + + # Disable it, we should see no hashing (reset was already defer()ed) + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {no_lbl}") + + _traffic(cfg, one_sock=False, one_cpu=True) + + +def _check_v4_flow_types(cfg): + for fl_type in ["tcp4", "udp4", "ah4", "esp4", "sctp4"]: + try: + cur = cmd(f"ethtool -n {cfg.ifname} rx-flow-hash {fl_type}").stdout + ksft_not_in("Flow Label", cur, + comment=f"{fl_type=} has Flow Label:" + cur) + except CmdExitFailure: + # Probably does not support this flow type + pass + + +def test_rss_flow_label_6only(cfg): + """ + Test interactions with IPv4 flow types. It should not be possible to set + IPv6 Flow Label hashing for an IPv4 flow type. The Flow Label should also + not appear in the IPv4 "current config". + """ + + with ksft_raises(CmdExitFailure) as cm: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash tcp4 sdfnl") + ksft_in("Invalid argument", cm.exception.cmd.stderr) + + _check_v4_flow_types(cfg) + + # Try to enable Flow Labels and check again, in case it leaks thru + initial = _ethtool_get_cfg(cfg, "udp6") + changed = initial.replace("l", "") if "l" in initial else initial + "l" + + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {changed}") + restore = defer(cmd, f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {initial}") + + _check_v4_flow_types(cfg) + restore.exec() + _check_v4_flow_types(cfg) + + +def main() -> None: + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + ksft_run([test_rss_flow_label, + test_rss_flow_label_6only], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() -- 2.50.1

5 months, 1 week

1
0
0 0

[PATCH RFC 0/4] procfs: make reference pidns more user-visible

by Aleksa Sarai

Ever since the introduction of pid namespaces, procfs has had very implicit behaviour surrounding them (the pidns used by a procfs mount is auto-selected based on the mounting process's active pidns, and the pidns itself is basically hidden once the mount has been constructed). This has historically meant that userspace was required to do some special dances in order to configure the pidns of a procfs mount as desired. Examples include: * In order to bypass the mnt_too_revealing() check, Kubernetes creates a procfs mount from an empty pidns so that user namespaced containers can be nested (without this, the nested containers would fail to mount procfs). But this requires forking off a helper process because you cannot just one-shot this using mount(2). * Container runtimes in general need to fork into a container before configuring its mounts, which can lead to security issues in the case of shared-pidns containers (a privileged process in the pidns can interact with your container runtime process). While SUID_DUMP_DISABLE and user namespaces make this less of an issue, the strict need for this due to a minor uAPI wart is kind of unfortunate. Things would be much easier if there was a way for userspace to just specify the pidns they want. Patch 1 implements a new "pidns" argument which can be set using fsconfig(2): fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd); fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0); or classic mount(2) / mount(8): // mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid"); The initial security model I have in this RFC is to be as conservative as possible and just mirror the security model for setns(2) -- which means that you can only set pidns=... to pid namespaces that your current pid namespace is a direct ancestor of. This fulfils the requirements of container runtimes, but I suspect that this may be too strict for some usecases. The pidns argument is not displayed in mountinfo -- it's not clear to me what value it would make sense to show (maybe we could just use ns_dname to provide an identifier for the namespace, but this number would be fairly useless to userspace). I'm open to suggestions. In addition, being able to figure out what pid namespace is being used by a procfs mount is quite useful when you have an administrative process (such as a container runtime) which wants to figure out the correct way of mapping PIDs between its own namespace and the namespace for procfs (using NS_GET_{PID,TGID}_{IN,FROM}_PIDNS). There are alternative ways to do this, but they all rely on ancillary information that third-party libraries and tools do not necessarily have access to. To make this easier, add a new ioctl (PROCFS_GET_PID_NAMESPACE) which can be used to get a reference to the pidns that a procfs is using. It's not quite clear what is the correct security model for this API, but the current approach I've taken is to: * Make the ioctl only valid on the root (meaning that a process without access to the procfs root -- such as only having an fd to a procfs file or some open_tree(2)-like subset -- cannot use this API). * Require that the process requesting either has access to /proc/1/ns/pid anyway (i.e. has ptrace-read access to the pidns pid1), has CAP_SYS_ADMIN access to the pidns (i.e. has administrative access to it and can join it if they had a handle), or is in a pidns that is a direct ancestor of the target pidns (i.e. all of the pids are already visible in the procfs for the current process's pidns). The security model for this is a little loose, as it seems to me that all of the cases mentioned are valid cases to allow access, but I'm open to suggestions for whether we need to make this stricter or looser. Signed-off-by: Aleksa Sarai <cyphar(a)cyphar.com> --- Aleksa Sarai (4): pidns: move is-ancestor logic to helper procfs: add pidns= mount option procfs: add PROCFS_GET_PID_NAMESPACE ioctl selftests/proc: add tests for new pidns APIs Documentation/filesystems/proc.rst | 10 ++ fs/proc/root.c | 132 +++++++++++++- include/linux/pid_namespace.h | 9 + include/uapi/linux/fs.h | 3 + kernel/pid_namespace.c | 21 ++- tools/testing/selftests/proc/.gitignore | 1 + tools/testing/selftests/proc/Makefile | 1 + tools/testing/selftests/proc/proc-pidns.c | 286 ++++++++++++++++++++++++++++++ 8 files changed, 448 insertions(+), 15 deletions(-) --- base-commit: 4c838c7672c39ec6ec48456c6ce22d14a68f4cda change-id: 20250717-procfs-pidns-api-8ed1583431f0 Best regards, -- Aleksa Sarai <cyphar(a)cyphar.com>

5 months, 1 week

2
7
0 0

[PATCH v2 0/5] KVM: Improve VMware guest support

by Zack Rusin

This is the second version of a series that lets us run VMware Workstation on Linux on top of KVM. The most significant change in this series is the introduction of CONFIG_KVM_VMWARE which is, in general, a nice cleanup for various bits of VMware compatibility code that have been scattered around KVM. (first patch) The rest of the series builds upon the VMware platform to implement features that are needed to run VMware guests without any modifications on top of KVM: - ability to turn on the VMware backdoor at runtime on a per-vm basis (used to be a kernel boot argument only) - support for VMware hypercalls - VMware products have a huge collection of hypercalls, all of which are handled in userspace, - support for handling legacy VMware backdoor in L0 in nested configs - in cases where we have WS running a Windows VBS guest, the L0 would be KVM, L1 Hyper-V so by default VMware Tools backdoor calls endup in Hyper-V which can not handle them, so introduce a cap to let L0 handle those. The final change in the series is a kselftest of the VMware hypercall functionality. Cc: Paolo Bonzini <pbonzini(a)redhat.com> Cc: Jonathan Corbet <corbet(a)lwn.net> Cc: Sean Christopherson <seanjc(a)google.com> Cc: Thomas Gleixner <tglx(a)linutronix.de> Cc: Ingo Molnar <mingo(a)redhat.com> Cc: Borislav Petkov <bp(a)alien8.de> Cc: Dave Hansen <dave.hansen(a)linux.intel.com> Cc: x86(a)kernel.org Cc: "H. Peter Anvin" <hpa(a)zytor.com> Cc: Zack Rusin <zack.rusin(a)broadcom.com> Cc: Doug Covelli <doug.covelli(a)broadcom.com> Cc: Shuah Khan <shuah(a)kernel.org> Cc: Namhyung Kim <namhyung(a)kernel.org> Cc: Arnaldo Carvalho de Melo <acme(a)redhat.com> Cc: Michael Ellerman <mpe(a)ellerman.id.au> Cc: Joel Stanley <joel(a)jms.id.au> Cc: Isaku Yamahata <isaku.yamahata(a)intel.com> Cc: kvm(a)vger.kernel.org Cc: linux-doc(a)vger.kernel.org Cc: linux-kernel(a)vger.kernel.org Cc: linux-kselftest(a)vger.kernel.org Zack Rusin (5): KVM: x86: Centralize KVM's VMware code KVM: x86: Allow enabling of the vmware backdoor via a cap KVM: x86: Add support for VMware guest specific hypercalls KVM: x86: Add support for legacy VMware backdoors in nested setups KVM: selftests: x86: Add a test for KVM_CAP_X86_VMWARE_HYPERCALL Documentation/virt/kvm/api.rst | 86 +++++++- MAINTAINERS | 9 + arch/x86/include/asm/kvm_host.h | 13 ++ arch/x86/kvm/Kconfig | 16 ++ arch/x86/kvm/Makefile | 1 + arch/x86/kvm/emulate.c | 11 +- arch/x86/kvm/kvm_vmware.c | 85 ++++++++ arch/x86/kvm/kvm_vmware.h | 189 ++++++++++++++++++ arch/x86/kvm/pmu.c | 39 +--- arch/x86/kvm/pmu.h | 4 - arch/x86/kvm/svm/nested.c | 6 + arch/x86/kvm/svm/svm.c | 10 +- arch/x86/kvm/vmx/nested.c | 6 + arch/x86/kvm/vmx/vmx.c | 5 +- arch/x86/kvm/x86.c | 74 +++---- arch/x86/kvm/x86.h | 2 - include/uapi/linux/kvm.h | 27 +++ tools/include/uapi/linux/kvm.h | 3 + tools/testing/selftests/kvm/Makefile.kvm | 1 + .../selftests/kvm/x86/vmware_hypercall_test.c | 121 +++++++++++ 20 files changed, 614 insertions(+), 94 deletions(-) create mode 100644 arch/x86/kvm/kvm_vmware.c create mode 100644 arch/x86/kvm/kvm_vmware.h create mode 100644 tools/testing/selftests/kvm/x86/vmware_hypercall_test.c -- 2.48.1

5 months, 1 week

2
2
0 0

[PATCH v2] rtc: Rename lib_test to test_rtc_lib

by Geert Uytterhoeven

When compiling the RTC library functions test as a module, the module has the non-descriptive name "lib_test.ko". Fix this by renaming it to "test_rtc_lib.ko". Signed-off-by: Geert Uytterhoeven <geert(a)linux-m68k.org> --- v2: - s/rtc_lib_test/test_rtc_lib/. --- drivers/rtc/Makefile | 2 +- drivers/rtc/{lib_test.c => test_rtc_lib.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename drivers/rtc/{lib_test.c => test_rtc_lib.c} (100%) diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 4619aa2ac4697591..789bddfea99d8fcd 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -15,7 +15,7 @@ rtc-core-$(CONFIG_RTC_INTF_DEV) += dev.o rtc-core-$(CONFIG_RTC_INTF_PROC) += proc.o rtc-core-$(CONFIG_RTC_INTF_SYSFS) += sysfs.o -obj-$(CONFIG_RTC_LIB_KUNIT_TEST) += lib_test.o +obj-$(CONFIG_RTC_LIB_KUNIT_TEST) += test_rtc_lib.o # Keep the list ordered. diff --git a/drivers/rtc/lib_test.c b/drivers/rtc/test_rtc_lib.c similarity index 100% rename from drivers/rtc/lib_test.c rename to drivers/rtc/test_rtc_lib.c -- 2.43.0

5 months, 1 week

2
1
0 0

[PATCH][next] tools/testing/selftests: Fix spelling mistake "unnmap" -> "unmap"

by Colin Ian King

There is a spelling mistake in ksft_test_result_fail messages. Fix them. Signed-off-by: Colin Ian King <colin.i.king(a)gmail.com> --- tools/testing/selftests/mm/mremap_test.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index fccf9e797a0c..774cdba102fc 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -525,10 +525,10 @@ static void mremap_move_multiple_vmas(unsigned int pattern_seed, out: if (success) ksft_test_result_pass("%s%s\n", test_name, - dont_unmap ? " [dontunnmap]" : ""); + dont_unmap ? " [dontunmap]" : ""); else ksft_test_result_fail("%s%s\n", test_name, - dont_unmap ? " [dontunnmap]" : ""); + dont_unmap ? " [dontunmap]" : ""); } static void mremap_shrink_multiple_vmas(unsigned long page_size, @@ -727,10 +727,10 @@ static void mremap_move_multiple_vmas_split(unsigned int pattern_seed, out: if (success) ksft_test_result_pass("%s%s\n", test_name, - dont_unmap ? " [dontunnmap]" : ""); + dont_unmap ? " [dontunmap]" : ""); else ksft_test_result_fail("%s%s\n", test_name, - dont_unmap ? " [dontunnmap]" : ""); + dont_unmap ? " [dontunmap]" : ""); } /* Returns the time taken for the remap on success else returns -1. */ -- 2.50.0

5 months, 1 week

5
4
0 0

[PATCH/RFC] kunit/rtc: Add real support for very slow tests

by Geert Uytterhoeven

When running rtc_lib_test ("lib_test" before my "[PATCH] rtc: Rename lib_test to rtc_lib_test") on m68k/ARAnyM: KTAP version 1 1..1 KTAP version 1 # Subtest: rtc_lib_test_cases # module: rtc_lib_test 1..2 # rtc_time64_to_tm_test_date_range_1000: Test should be marked slow (runtime: 3.222371420s) ok 1 rtc_time64_to_tm_test_date_range_1000 # rtc_time64_to_tm_test_date_range_160000: try timed out # rtc_time64_to_tm_test_date_range_160000: test case timed out # rtc_time64_to_tm_test_date_range_160000.speed: slow not ok 2 rtc_time64_to_tm_test_date_range_160000 # rtc_lib_test_cases: pass:1 fail:1 skip:0 total:2 # Totals: pass:1 fail:1 skip:0 total:2 not ok 1 rtc_lib_test_cases Commit 02c2d0c2a84172c3 ("kunit: Add speed attribute") added the notion of "very slow" tests, but this is further unused and unhandled. Hence: 1. Introduce KUNIT_CASE_VERY_SLOW(), 2. Increase timeout by ten; ideally this should only be done for very slow tests, but I couldn't find how to access kunit_case.attr.case from kunit_try_catch_run(), 3. Mark rtc_time64_to_tm_test_date_range_1000 slow, 4. Mark rtc_time64_to_tm_test_date_range_160000 very slow. Afterwards: KTAP version 1 1..1 KTAP version 1 # Subtest: rtc_lib_test_cases # module: rtc_lib_test 1..2 # rtc_time64_to_tm_test_date_range_1000.speed: slow ok 1 rtc_time64_to_tm_test_date_range_1000 # rtc_time64_to_tm_test_date_range_160000.speed: very_slow ok 2 rtc_time64_to_tm_test_date_range_160000 # rtc_lib_test_cases: pass:2 fail:0 skip:0 total:2 # Totals: pass:2 fail:0 skip:0 total:2 ok 1 rtc_lib_test_cases Signed-off-by: Geert Uytterhoeven <geert(a)linux-m68k.org> --- drivers/rtc/rtc_lib_test.c | 4 ++-- include/kunit/test.h | 11 +++++++++++ lib/kunit/try-catch.c | 3 ++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc_lib_test.c b/drivers/rtc/rtc_lib_test.c index c30c759662e39b48..fd3210e39d37dbc6 100644 --- a/drivers/rtc/rtc_lib_test.c +++ b/drivers/rtc/rtc_lib_test.c @@ -85,8 +85,8 @@ static void rtc_time64_to_tm_test_date_range_1000(struct kunit *test) } static struct kunit_case rtc_lib_test_cases[] = { - KUNIT_CASE(rtc_time64_to_tm_test_date_range_1000), - KUNIT_CASE_SLOW(rtc_time64_to_tm_test_date_range_160000), + KUNIT_CASE_SLOW(rtc_time64_to_tm_test_date_range_1000), + KUNIT_CASE_VERY_SLOW(rtc_time64_to_tm_test_date_range_160000), {} }; diff --git a/include/kunit/test.h b/include/kunit/test.h index 9b773406e01f3c43..4e3c1cae5b41466e 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -183,6 +183,17 @@ static inline char *kunit_status_to_ok_not_ok(enum kunit_status status) { .run_case = test_name, .name = #test_name, \ .attr.speed = KUNIT_SPEED_SLOW, .module_name = KBUILD_MODNAME} +/** + * KUNIT_CASE_VERY_SLOW - A helper for creating a &struct kunit_case + * with the very slow attribute + * + * @test_name: a reference to a test case function. + */ + +#define KUNIT_CASE_VERY_SLOW(test_name) \ + { .run_case = test_name, .name = #test_name, \ + .attr.speed = KUNIT_SPEED_VERY_SLOW, .module_name = KBUILD_MODNAME} + /** * KUNIT_CASE_PARAM - A helper for creation a parameterized &struct kunit_case * diff --git a/lib/kunit/try-catch.c b/lib/kunit/try-catch.c index 6bbe0025b0790bd2..92099c67bb21d0a4 100644 --- a/lib/kunit/try-catch.c +++ b/lib/kunit/try-catch.c @@ -56,7 +56,8 @@ static unsigned long kunit_test_timeout(void) * If tests timeout due to exceeding sysctl_hung_task_timeout_secs, * the task will be killed and an oops generated. */ - return 300 * msecs_to_jiffies(MSEC_PER_SEC); /* 5 min */ + // FIXME times ten for KUNIT_SPEED_VERY_SLOW? + return 10 * 300 * msecs_to_jiffies(MSEC_PER_SEC); /* 5 min */ } void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context) -- 2.43.0

5 months, 1 week

3
3
0 0

[PATCH net-next v2 4/4] selftests: drv-net: add test for RSS on flow label

by Jakub Kicinski

Add a simple test for checking that RSS on flow label works, and that its rejected for IPv4 flows. # ./tools/testing/selftests/drivers/net/hw/rss_flow_label.py TAP version 13 1..2 ok 1 rss_flow_label.test_rss_flow_label ok 2 rss_flow_label.test_rss_flow_label_6only # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> --- CC: shuah(a)kernel.org CC: sdf(a)fomichev.me CC: linux-kselftest(a)vger.kernel.org --- .../testing/selftests/drivers/net/hw/Makefile | 1 + .../drivers/net/hw/rss_flow_label.py | 151 ++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/rss_flow_label.py diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index fdc97355588c..5159fd34cb33 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -18,6 +18,7 @@ TEST_PROGS = \ pp_alloc_fail.py \ rss_api.py \ rss_ctx.py \ + rss_flow_label.py \ rss_input_xfrm.py \ tso.py \ xsk_reconfig.py \ diff --git a/tools/testing/selftests/drivers/net/hw/rss_flow_label.py b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py new file mode 100755 index 000000000000..e471e13160ae --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Tests for RSS hashing on IPv6 Flow Label. +""" + +import glob +import socket +from lib.py import CmdExitFailure +from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge, ksft_in, \ + ksft_not_in, ksft_raises, KsftSkipEx +from lib.py import bkg, cmd, defer, fd_read_timeout, rand_port +from lib.py import NetDrvEpEnv + + +def _ethtool_get_cfg(cfg, fl_type): + descr = cmd(f"ethtool -n {cfg.ifname} rx-flow-hash {fl_type}").stdout + + converter = { + "IP SA": "s", + "IP DA": "d", + "L3 proto": "t", + "L4 bytes 0 & 1 [TCP/UDP src port]": "f", + "L4 bytes 2 & 3 [TCP/UDP dst port]": "n", + "IPv6 Flow Label": "l", + } + + ret = "" + for line in descr.split("\n")[1:-2]: + # if this raises we probably need to add more keys to converter above + ret += converter[line] + return ret + + +def _traffic(cfg, one_sock, one_cpu): + local_port = rand_port(socket.SOCK_DGRAM) + remote_port = rand_port(socket.SOCK_DGRAM) + + sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) + sock.bind(("", local_port)) + sock.connect((cfg.remote_addr_v["6"], 0)) + if one_sock: + send = f"exec 5<>/dev/udp/{cfg.addr_v['6']}/{local_port}; " \ + "for i in `seq 20`; do echo a >&5; sleep 0.02; done; exec 5>&-" + else: + send = "for i in `seq 20`; do echo a | socat -t0.02 - UDP6:" \ + f"[{cfg.addr_v['6']}]:{local_port},sourceport={remote_port}; done" + + cpus = set() + with bkg(send, shell=True, host=cfg.remote, exit_wait=True): + for _ in range(20): + fd_read_timeout(sock.fileno(), 1) + cpu = sock.getsockopt(socket.SOL_SOCKET, socket.SO_INCOMING_CPU) + cpus.add(cpu) + + if one_cpu: + ksft_eq(len(cpus), 1, + f"{one_sock=} - expected one CPU, got traffic on: {cpus=}") + else: + ksft_ge(len(cpus), 2, + f"{one_sock=} - expected many CPUs, got traffic on: {cpus=}") + + +def test_rss_flow_label(cfg): + """ + Test hashing on IPv6 flow label. Send traffic over a single socket + and over multiple sockets. Depend on the remote having auto-label + enabled so that it randomizes the label per socket. + """ + + cfg.require_ipver("6") + cfg.require_cmd("socat", remote=True) + if not hasattr(socket, "SO_INCOMING_CPU"): + raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11") + + # 1 is the default, if someone changed it we probably shouldn"t mess with it + af = cmd("cat /proc/sys/net/ipv6/auto_flowlabels", host=cfg.remote).stdout + if af.strip() != "1": + raise KsftSkipEx("Remote does not have auto_flowlabels enabled") + + qcnt = len(glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*")) + if qcnt < 2: + raise KsftSkipEx(f"Local has only {qcnt} queues") + + # Enable flow label hashing for UDP6 + initial = _ethtool_get_cfg(cfg, "udp6") + no_lbl = initial.replace("l", "") + if "l" not in initial: + try: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 l{no_lbl}") + except CmdExitFailure as exc: + raise KsftSkipEx("Device doesn't support Flow Label for UDP6") from exc + + defer(cmd, f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {initial}") + + _traffic(cfg, one_sock=True, one_cpu=True) + _traffic(cfg, one_sock=False, one_cpu=False) + + # Disable it, we should see no hashing (reset was already defer()ed) + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {no_lbl}") + + _traffic(cfg, one_sock=False, one_cpu=True) + + +def _check_v4_flow_types(cfg): + for fl_type in ["tcp4", "udp4", "ah4", "esp4", "sctp4"]: + try: + cur = cmd(f"ethtool -n {cfg.ifname} rx-flow-hash {fl_type}").stdout + ksft_not_in("Flow Label", cur, + comment=f"{fl_type=} has Flow Label:" + cur) + except CmdExitFailure: + # Probably does not support this flow type + pass + + +def test_rss_flow_label_6only(cfg): + """ + Test interactions with IPv4 flow types. It should not be possible to set + IPv6 Flow Label hashing for an IPv4 flow type. The Flow Label should also + not appear in the IPv4 "current config". + """ + + with ksft_raises(CmdExitFailure) as cm: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash tcp4 sdfnl") + ksft_in("Invalid argument", cm.exception.cmd.stderr) + + _check_v4_flow_types(cfg) + + # Try to enable Flow Labels and check again, in case it leaks thru + initial = _ethtool_get_cfg(cfg, "udp6") + changed = initial.replace("l", "") if "l" in initial else initial + "l" + + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {changed}") + restore = defer(cmd, f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {initial}") + + _check_v4_flow_types(cfg) + restore.exec() + _check_v4_flow_types(cfg) + + +def main() -> None: + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + ksft_run([test_rss_flow_label, + test_rss_flow_label_6only], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() -- 2.50.1

5 months, 1 week

2
1
0 0

[PATCH v6] selftests/mm: add process_madvise() tests

by wang lian

Add tests for process_madvise(), focusing on verifying behavior under various conditions including valid usage and error cases. Signed-off-by: wang lian <lianux.mm(a)gmail.com> Suggested-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com> Suggested-by: David Hildenbrand <david(a)redhat.com> Suggested-by: Zi Yan <ziy(a)nvidia.com> Suggested-by: Mark Brown <broonie(a)kernel.org> Acked-by: SeongJae Park <sj(a)kernel.org> --- Changelog v6: - Refactor child process and pidfd management to use the kselftest fixture's setup and teardown mechanism. This ensures that child processes are reliably terminated and file descriptors are closed, even when a test is aborted by an ASSERT or SKIP macro. This resolves the issue where a failed assertion could lead to a leaked child process. Changelog v5: https://lore.kernel.org/lkml/20250714122533.3135-1-lianux.mm@gmail.com/ - Refactor the remote_collapse test to concentrate on its primary goal confirming the successful remote invocation of process_madvise() on a child process. - Split the validation logic for invalid pidfds out of the remote test and into two new (`exited_process_pidfd` and `bad_pidfd`). - Based mm-new branch, can ensure clean application Changelog v4: https://lore.kernel.org/lkml/20250710112249.58722-1-lianux.mm@gmail.com/ - Refine resource cleanup logic in test teardown to be more robust. - Improve remote_collapse test to correctly handle different THP (Transparent Huge Page) policies ('always', 'madvise', 'never'), including handling race conditions with khugepaged. - Resolve build errors Changelog v3: https://lore.kernel.org/lkml/20250703044326.65061-1-lianux.mm@gmail.com/ - Rebased onto the latest mm-stable branch to ensure clean application. - Refactor common signal handling logic into vm_util to reduce code duplication. - Improve test robustness and diagnostics based on community feedback. - Address minor code style and script corrections. Changelog v2: https://lore.kernel.org/lkml/20250630140957.4000-1-lianux.mm@gmail.com/ - Drop MADV_DONTNEED tests based on feedback. - Focus solely on process_madvise() syscall. - Improve error handling and structure. - Add future-proof flag test. - Style and comment cleanups. -V1: https://lore.kernel.org/lkml/20250621133003.4733-1-lianux.mm@gmail.com/ tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/process_madv.c | 302 ++++++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 5 + 4 files changed, 309 insertions(+) create mode 100644 tools/testing/selftests/mm/process_madv.c diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index f2dafa0b700b..e7b23a8a05fe --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -21,6 +21,7 @@ on-fault-limit transhuge-stress pagemap_ioctl pfnmap +process_madv *.tmp* protection_keys protection_keys_32 diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index ae6f994d3add..d13b3cef2a2b 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += pfnmap +TEST_GEN_FILES += process_madv TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += uffd-stress diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c new file mode 100644 index 000000000000..8a83eac3bfab --- /dev/null +++ b/tools/testing/selftests/mm/process_madv.c @@ -0,0 +1,302 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE +#include "../kselftest_harness.h" +#include <errno.h> +#include <setjmp.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/mman.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <sched.h> +#include "vm_util.h" + +#include "../pidfd/pidfd.h" + +FIXTURE(process_madvise) +{ + unsigned long page_size; + pid_t child_pid; + int remote_pidfd; + int pidfd; +}; + +FIXTURE_SETUP(process_madvise) +{ + self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); + self->pidfd = PIDFD_SELF; + self->remote_pidfd = -1; + self->child_pid = -1; +}; + +FIXTURE_TEARDOWN_PARENT(process_madvise) +{ + /* This teardown is guaranteed to run, even if tests SKIP or ASSERT */ + if (self->child_pid > 0) { + kill(self->child_pid, SIGKILL); + waitpid(self->child_pid, NULL, 0); + } + + if (self->remote_pidfd >= 0) + close(self->remote_pidfd); +} + +static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, + size_t vlen, int advice, unsigned int flags) +{ + return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags); +} + +/* + * This test uses PIDFD_SELF to target the current process. The main + * goal is to verify the basic behavior of process_madvise() with + * a vector of non-contiguous memory ranges, not its cross-process + * capabilities. + */ +TEST_F(process_madvise, basic) +{ + const unsigned long pagesize = self->page_size; + const int madvise_pages = 4; + struct iovec vec[madvise_pages]; + int pidfd = self->pidfd; + ssize_t ret; + char *map; + + /* + * Create a single large mapping. We will pick pages from this + * mapping to advise on. This ensures we test non-contiguous iovecs. + */ + map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + /* Fill the entire region with a known pattern. */ + memset(map, 'A', pagesize * 10); + + /* + * Setup the iovec to point to 4 non-contiguous pages + * within the mapping. + */ + vec[0].iov_base = &map[0 * pagesize]; + vec[0].iov_len = pagesize; + vec[1].iov_base = &map[3 * pagesize]; + vec[1].iov_len = pagesize; + vec[2].iov_base = &map[5 * pagesize]; + vec[2].iov_len = pagesize; + vec[3].iov_base = &map[8 * pagesize]; + vec[3].iov_len = pagesize; + + ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0); + if (ret == -1 && errno == EPERM) + SKIP(return, + "process_madvise() unsupported or permission denied, try running as root.\n"); + else if (errno == EINVAL) + SKIP(return, + "process_madvise() unsupported or parameter invalid, please check arguments.\n"); + + /* The call should succeed and report the total bytes processed. */ + ASSERT_EQ(ret, madvise_pages * pagesize); + + /* Check that advised pages are now zero. */ + for (int i = 0; i < madvise_pages; i++) { + char *advised_page = (char *)vec[i].iov_base; + + /* Content must be 0, not 'A'. */ + ASSERT_EQ(*advised_page, '\0'); + } + + /* Check that an un-advised page in between is still 'A'. */ + char *unadvised_page = &map[1 * pagesize]; + + for (int i = 0; i < pagesize; i++) + ASSERT_EQ(unadvised_page[i], 'A'); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize * 10), 0); +} + +/* + * This test deterministically validates process_madvise() with MADV_COLLAPSE + * on a remote process, other advices are difficult to verify reliably. + * + * The test verifies that a memory region in a child process, + * focus on process_madv remote result, only check addresses and lengths. + * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged. + */ +TEST_F(process_madvise, remote_collapse) +{ + const unsigned long pagesize = self->page_size; + long huge_page_size; + int pipe_info[2]; + ssize_t ret; + struct iovec vec; + + struct child_info { + pid_t pid; + void *map_addr; + } info; + + huge_page_size = default_huge_page_size(); + if (huge_page_size <= 0) + SKIP(return, "Could not determine a valid huge page size.\n"); + + ASSERT_EQ(pipe(pipe_info), 0); + + self->child_pid = fork(); + ASSERT_NE(self->child_pid, -1); + + if (self->child_pid == 0) { + char *map; + size_t map_size = 2 * huge_page_size; + + close(pipe_info[0]); + + map = mmap(NULL, map_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Fault in as small pages */ + for (size_t i = 0; i < map_size; i += pagesize) + map[i] = 'A'; + + /* Send info and pause */ + info.pid = getpid(); + info.map_addr = map; + ret = write(pipe_info[1], &info, sizeof(info)); + ASSERT_EQ(ret, sizeof(info)); + close(pipe_info[1]); + + pause(); + exit(0); + } + + close(pipe_info[1]); + + /* Receive child info */ + ret = read(pipe_info[0], &info, sizeof(info)); + if (ret <= 0) { + waitpid(self->child_pid, NULL, 0); + SKIP(return, "Failed to read child info from pipe.\n"); + } + ASSERT_EQ(ret, sizeof(info)); + close(pipe_info[0]); + self->child_pid = info.pid; + + self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); + ASSERT_GE(self->remote_pidfd, 0); + + vec.iov_base = info.map_addr; + vec.iov_len = huge_page_size; + + ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE, 0); + if (ret == -1) { + if (errno == EINVAL) + SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n"); + else if (errno == EPERM) + SKIP(return, + "No process_madvise() permissions, try running as root.\n"); + return; + } + + ASSERT_EQ(ret, huge_page_size); +} + +/* + * Test process_madvise() with a pidfd for a process that has already + * exited to ensure correct error handling. + */ +TEST_F(process_madvise, exited_process_pidfd) +{ + struct iovec vec; + ssize_t ret; + int pidfd; + + vec.iov_base = (void *)0x1234; + vec.iov_len = 4096; + + /* + * Using a pidfd for a process that has already exited should fail + * with ESRCH. + */ + self->child_pid = fork(); + ASSERT_NE(self->child_pid, -1); + + if (self->child_pid == 0) + exit(0); + + pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); + ASSERT_GE(pidfd, 0); + + /* Wait for the child to ensure it has terminated. */ + waitpid(self->child_pid, NULL, 0); + + ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ESRCH); + close(pidfd); +} + +/* + * Test process_madvise() with bad pidfds to ensure correct error + * handling. + */ +TEST_F(process_madvise, bad_pidfd) +{ + struct iovec vec; + ssize_t ret; + + vec.iov_base = (void *)0x1234; + vec.iov_len = 4096; + + /* Using an invalid fd number (-1) should fail with EBADF. */ + ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBADF); + + /* + * Using a valid fd that is not a pidfd (e.g. stdin) should fail + * with EBADF. + */ + ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBADF); +} + +/* + * Test process_madvise() with an invalid flag value. Currently, only a flag + * value of 0 is supported. This test is reserved for the future, e.g., if + * synchronous flags are added. + */ +TEST_F(process_madvise, flag) +{ + const unsigned long pagesize = self->page_size; + unsigned int invalid_flag; + int pidfd = self->pidfd; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + invalid_flag = 0x80000000; + + ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index a38c984103ce..471e539d82b8 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -65,6 +65,8 @@ separated by spaces: test pagemap_scan IOCTL - pfnmap tests for VM_PFNMAP handling +- process_madv + test for process_madv - cow test copy-on-write semantics - thp @@ -425,6 +427,9 @@ CATEGORY="madv_guard" run_test ./guard-regions # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate +# PROCESS_MADV test +CATEGORY="process_madv" run_test ./process_madv + CATEGORY="vma_merge" run_test ./merge if [ -x ./memfd_secret ] -- 2.43.0

5 months, 1 week

2
2
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-kselftest-mirror July 2025