The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f2b492b04a167261e1c38eb76f78fb4294473a49
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062235-census-ramp-a602@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f2b492b04a167261e1c38eb76f78fb4294473a49 Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Date: Sat, 10 Jun 2023 18:11:47 +0200
Subject: [PATCH] selftests: mptcp: join: skip userspace PM tests if not
supported
Selftests are supposed to run on any kernel, including old ones that do
not support all MPTCP features.
One of them is the support of the userspace PM introduced by commit
4638de5aefe5 ("mptcp: handle local addrs announced by userspace PMs")
and the following ones.
It is possible to look for the MPTCP pm_type's sysctl knob to know in
advance if the userspace PM is available.
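The checks added below rely on an mptcp_lib_has_file helper; a minimal
sketch of such a check (assumed implementation, the real helper lives
in mptcp_lib.sh) could be:

	# sketch: report whether a feature-specific file exists, e.g. the
	# pm_type sysctl knob, so tests can be skipped on old kernels
	mptcp_lib_has_file() {
		local f="${1}"
		[ -f "${f}" ]
	}

	mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type' || \
		echo "userspace PM not available, skipping"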
Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368
Fixes: 5ac1d2d63451 ("selftests: mptcp: Add tests for userspace PM type")
Cc: stable(a)vger.kernel.org
Signed-off-by: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index f8e58ebcdd54..f9161ed69b86 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -84,7 +84,7 @@ init_partial()
ip netns add $netns || exit $ksft_skip
ip -net $netns link set lo up
ip netns exec $netns sysctl -q net.mptcp.enabled=1
- ip netns exec $netns sysctl -q net.mptcp.pm_type=0
+ ip netns exec $netns sysctl -q net.mptcp.pm_type=0 2>/dev/null || true
ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0
ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0
if [ $checksum -eq 1 ]; then
@@ -3191,7 +3191,8 @@ fail_tests()
userspace_tests()
{
# userspace pm type prevents add_addr
- if reset "userspace pm type prevents add_addr"; then
+ if reset "userspace pm type prevents add_addr" &&
+ continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns1
pm_nl_set_limits $ns1 0 2
pm_nl_set_limits $ns2 0 2
@@ -3202,7 +3203,8 @@ userspace_tests()
fi
# userspace pm type does not echo add_addr without daemon
- if reset "userspace pm no echo w/o daemon"; then
+ if reset "userspace pm no echo w/o daemon" &&
+ continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns2
pm_nl_set_limits $ns1 0 2
pm_nl_set_limits $ns2 0 2
@@ -3213,7 +3215,8 @@ userspace_tests()
fi
# userspace pm type rejects join
- if reset "userspace pm type rejects join"; then
+ if reset "userspace pm type rejects join" &&
+ continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns1
pm_nl_set_limits $ns1 1 1
pm_nl_set_limits $ns2 1 1
@@ -3223,7 +3226,8 @@ userspace_tests()
fi
# userspace pm type does not send join
- if reset "userspace pm type does not send join"; then
+ if reset "userspace pm type does not send join" &&
+ continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns2
pm_nl_set_limits $ns1 1 1
pm_nl_set_limits $ns2 1 1
@@ -3233,7 +3237,8 @@ userspace_tests()
fi
# userspace pm type prevents mp_prio
- if reset "userspace pm type prevents mp_prio"; then
+ if reset "userspace pm type prevents mp_prio" &&
+ continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns1
pm_nl_set_limits $ns1 1 1
pm_nl_set_limits $ns2 1 1
@@ -3244,7 +3249,8 @@ userspace_tests()
fi
# userspace pm type prevents rm_addr
- if reset "userspace pm type prevents rm_addr"; then
+ if reset "userspace pm type prevents rm_addr" &&
+ continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns1
set_userspace_pm $ns2
pm_nl_set_limits $ns1 0 1
@@ -3256,7 +3262,8 @@ userspace_tests()
fi
# userspace pm add & remove address
- if reset_with_events "userspace pm add & remove address"; then
+ if reset_with_events "userspace pm add & remove address" &&
+ continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns1
pm_nl_set_limits $ns2 1 1
run_tests $ns1 $ns2 10.0.1.1 0 userspace_1 0 slow
@@ -3267,7 +3274,8 @@ userspace_tests()
fi
# userspace pm create destroy subflow
- if reset_with_events "userspace pm create destroy subflow"; then
+ if reset_with_events "userspace pm create destroy subflow" &&
+ continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns2
pm_nl_set_limits $ns1 0 1
run_tests $ns1 $ns2 10.0.1.1 0 0 userspace_1 slow
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 4a0b866a3f7d3c22033f40e93e94befc6fe51bce
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062218-dude-unshaven-b4cb@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4a0b866a3f7d3c22033f40e93e94befc6fe51bce Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Date: Sat, 10 Jun 2023 18:11:40 +0200
Subject: [PATCH] selftests: mptcp: join: skip test if iptables/tc cmds fail
Selftests are supposed to run on any kernel, including old ones that do
not support all MPTCP features.
Some tests are using IPTables and/or TC commands to force some
behaviours. If one of these commands fails -- likely because some
features are not available due to missing kernel config -- we should
intercept the error and skip the tests requiring these features.
Note that if we expect to have these features available and if
SELFTESTS_MPTCP_LIB_EXPECT_ALL_FEATURES env var is set to 1, the tests
will be marked as failed instead of skipped.
This patch also replaces 'exit 1' with 'return 1' so that an issue with
NF or TC does not stop the selftest in the middle, without a conclusion.
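As an illustration of the skip-vs-fail behaviour described above, a
mark_as_skipped-style helper could look roughly like this (a sketch,
not the exact selftest code; only the env var name is taken from the
text above):

	mark_as_skipped() {
		local msg="${1}"

		# if all features are expected, a missing one is a failure
		if [ "${SELFTESTS_MPTCP_LIB_EXPECT_ALL_FEATURES:-0}" = "1" ]; then
			echo "FAIL: ${msg}"
			ret=1	# assumed global result variable
		else
			echo "SKIP: ${msg}"
		fi
	}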
Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368
Fixes: 8d014eaa9254 ("selftests: mptcp: add ADD_ADDR timeout test case")
Cc: stable(a)vger.kernel.org
Signed-off-by: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 276396cbe60c..c471934ad5e0 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -286,11 +286,15 @@ reset_with_add_addr_timeout()
fi
ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=1
- ip netns exec $ns2 $tables -A OUTPUT -p tcp \
- -m tcp --tcp-option 30 \
- -m bpf --bytecode \
- "$CBPF_MPTCP_SUBOPTION_ADD_ADDR" \
- -j DROP
+
+ if ! ip netns exec $ns2 $tables -A OUTPUT -p tcp \
+ -m tcp --tcp-option 30 \
+ -m bpf --bytecode \
+ "$CBPF_MPTCP_SUBOPTION_ADD_ADDR" \
+ -j DROP; then
+ mark_as_skipped "unable to set the 'add addr' rule"
+ return 1
+ fi
}
# $1: test name
@@ -334,17 +338,12 @@ reset_with_allow_join_id0()
# tc action pedit offset 162 out of bounds
#
# Netfilter is used to mark packets with enough data.
-reset_with_fail()
+setup_fail_rules()
{
- reset "${1}" || return 1
-
- ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=1
- ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=1
-
check_invert=1
validate_checksum=1
- local i="$2"
- local ip="${3:-4}"
+ local i="$1"
+ local ip="${2:-4}"
local tables
tables="${iptables}"
@@ -359,15 +358,32 @@ reset_with_fail()
-p tcp \
-m length --length 150:9999 \
-m statistic --mode nth --packet 1 --every 99999 \
- -j MARK --set-mark 42 || exit 1
+ -j MARK --set-mark 42 || return ${ksft_skip}
- tc -n $ns2 qdisc add dev ns2eth$i clsact || exit 1
+ tc -n $ns2 qdisc add dev ns2eth$i clsact || return ${ksft_skip}
tc -n $ns2 filter add dev ns2eth$i egress \
protocol ip prio 1000 \
handle 42 fw \
action pedit munge offset 148 u8 invert \
pipe csum tcp \
- index 100 || exit 1
+ index 100 || return ${ksft_skip}
+}
+
+reset_with_fail()
+{
+ reset "${1}" || return 1
+ shift
+
+ ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=1
+ ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=1
+
+ local rc=0
+ setup_fail_rules "${@}" || rc=$?
+
+ if [ ${rc} -eq ${ksft_skip} ]; then
+ mark_as_skipped "unable to set the 'fail' rules"
+ return 1
+ fi
}
reset_with_events()
@@ -382,6 +398,25 @@ reset_with_events()
evts_ns2_pid=$!
}
+reset_with_tcp_filter()
+{
+ reset "${1}" || return 1
+ shift
+
+ local ns="${!1}"
+ local src="${2}"
+ local target="${3}"
+
+ if ! ip netns exec "${ns}" ${iptables} \
+ -A INPUT \
+ -s "${src}" \
+ -p tcp \
+ -j "${target}"; then
+ mark_as_skipped "unable to set the filter rules"
+ return 1
+ fi
+}
+
fail_test()
{
ret=1
@@ -745,15 +780,6 @@ pm_nl_check_endpoint()
fi
}
-filter_tcp_from()
-{
- local ns="${1}"
- local src="${2}"
- local target="${3}"
-
- ip netns exec "${ns}" ${iptables} -A INPUT -s "${src}" -p tcp -j "${target}"
-}
-
do_transfer()
{
local listener_ns="$1"
@@ -1975,23 +2001,23 @@ subflows_error_tests()
fi
# multiple subflows, with subflow creation error
- if reset "multi subflows, with failing subflow"; then
+ if reset_with_tcp_filter "multi subflows, with failing subflow" ns1 10.0.3.2 REJECT &&
+ continue_if mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then
pm_nl_set_limits $ns1 0 2
pm_nl_set_limits $ns2 0 2
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
- filter_tcp_from $ns1 10.0.3.2 REJECT
run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
chk_join_nr 1 1 1
fi
# multiple subflows, with subflow timeout on MPJ
- if reset "multi subflows, with subflow timeout"; then
+ if reset_with_tcp_filter "multi subflows, with subflow timeout" ns1 10.0.3.2 DROP &&
+ continue_if mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then
pm_nl_set_limits $ns1 0 2
pm_nl_set_limits $ns2 0 2
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
- filter_tcp_from $ns1 10.0.3.2 DROP
run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow
chk_join_nr 1 1 1
fi
@@ -1999,11 +2025,11 @@ subflows_error_tests()
# multiple subflows, check that the endpoint corresponding to
# closed subflow (due to reset) is not reused if additional
# subflows are added later
- if reset "multi subflows, fair usage on close"; then
+ if reset_with_tcp_filter "multi subflows, fair usage on close" ns1 10.0.3.2 REJECT &&
+ continue_if mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then
pm_nl_set_limits $ns1 0 1
pm_nl_set_limits $ns2 0 1
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
- filter_tcp_from $ns1 10.0.3.2 REJECT
run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow &
# mpj subflow will be in TW after the reset
commit eb0764b822b9 ("cxl/port: Enable the HDM decoder capability for switch ports")
...was added on the observation of CXL memory not being accessible after
setting up a region on a "cold-plugged" device. A "cold-plugged" CXL
device is one that was not present at boot, so platform-firmware/BIOS
has no chance to set it up.
While it is true that the debug found the enable bit clear in the
host-bridge's instance of the global control register (CXL 3.0
8.2.4.19.2 CXL HDM Decoder Global Control Register), that bit is
described as:
"This bit is only applicable to CXL.mem devices and shall
return 0 on CXL Host Bridges and Upstream Switch Ports."
So it is meant to be zero, and further testing confirmed that this "fix"
had no effect on the failure. Revert it, and be more vigilant about
proposed fixes in the future. Since the original copied stable@, flag
this revert for stable@ as well.
Cc: <stable(a)vger.kernel.org>
Fixes: eb0764b822b9 ("cxl/port: Enable the HDM decoder capability for switch ports")
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
drivers/cxl/core/pci.c | 27 ++++-----------------------
drivers/cxl/cxl.h | 1 -
drivers/cxl/port.c | 14 +++++---------
tools/testing/cxl/Kbuild | 1 -
tools/testing/cxl/test/mock.c | 15 ---------------
5 files changed, 9 insertions(+), 49 deletions(-)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 7440f84be6c8..552203c13b39 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -308,36 +308,17 @@ static void disable_hdm(void *_cxlhdm)
hdm + CXL_HDM_DECODER_CTRL_OFFSET);
}
-int devm_cxl_enable_hdm(struct cxl_port *port, struct cxl_hdm *cxlhdm)
+static int devm_cxl_enable_hdm(struct device *host, struct cxl_hdm *cxlhdm)
{
- void __iomem *hdm;
+ void __iomem *hdm = cxlhdm->regs.hdm_decoder;
u32 global_ctrl;
- /*
- * If the hdm capability was not mapped there is nothing to enable and
- * the caller is responsible for what happens next. For example,
- * emulate a passthrough decoder.
- */
- if (IS_ERR(cxlhdm))
- return 0;
-
- hdm = cxlhdm->regs.hdm_decoder;
global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET);
-
- /*
- * If the HDM decoder capability was enabled on entry, skip
- * registering disable_hdm() since this decode capability may be
- * owned by platform firmware.
- */
- if (global_ctrl & CXL_HDM_DECODER_ENABLE)
- return 0;
-
writel(global_ctrl | CXL_HDM_DECODER_ENABLE,
hdm + CXL_HDM_DECODER_CTRL_OFFSET);
- return devm_add_action_or_reset(&port->dev, disable_hdm, cxlhdm);
+ return devm_add_action_or_reset(host, disable_hdm, cxlhdm);
}
-EXPORT_SYMBOL_NS_GPL(devm_cxl_enable_hdm, CXL);
int cxl_dvsec_rr_decode(struct device *dev, int d,
struct cxl_endpoint_dvsec_info *info)
@@ -511,7 +492,7 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
if (info->mem_enabled)
return 0;
- rc = devm_cxl_enable_hdm(port, cxlhdm);
+ rc = devm_cxl_enable_hdm(&port->dev, cxlhdm);
if (rc)
return rc;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 74548f8f5f4c..d743df66a582 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -717,7 +717,6 @@ struct cxl_endpoint_dvsec_info {
struct cxl_hdm;
struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port,
struct cxl_endpoint_dvsec_info *info);
-int devm_cxl_enable_hdm(struct cxl_port *port, struct cxl_hdm *cxlhdm);
int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
struct cxl_endpoint_dvsec_info *info);
int devm_cxl_add_passthrough_decoder(struct cxl_port *port);
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 5ffe3c7d2f5e..43718d0396d7 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -60,17 +60,13 @@ static int discover_region(struct device *dev, void *root)
static int cxl_switch_port_probe(struct cxl_port *port)
{
struct cxl_hdm *cxlhdm;
- int rc, nr_dports;
-
- nr_dports = devm_cxl_port_enumerate_dports(port);
- if (nr_dports < 0)
- return nr_dports;
+ int rc;
- cxlhdm = devm_cxl_setup_hdm(port, NULL);
- rc = devm_cxl_enable_hdm(port, cxlhdm);
- if (rc)
+ rc = devm_cxl_port_enumerate_dports(port);
+ if (rc < 0)
return rc;
+ cxlhdm = devm_cxl_setup_hdm(port, NULL);
if (!IS_ERR(cxlhdm))
return devm_cxl_enumerate_decoders(cxlhdm, NULL);
@@ -79,7 +75,7 @@ static int cxl_switch_port_probe(struct cxl_port *port)
return PTR_ERR(cxlhdm);
}
- if (nr_dports == 1) {
+ if (rc == 1) {
dev_dbg(&port->dev, "Fallback to passthrough decoder\n");
return devm_cxl_add_passthrough_decoder(port);
}
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 6f9347ade82c..fba7bec96acd 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -6,7 +6,6 @@ ldflags-y += --wrap=acpi_pci_find_root
ldflags-y += --wrap=nvdimm_bus_register
ldflags-y += --wrap=devm_cxl_port_enumerate_dports
ldflags-y += --wrap=devm_cxl_setup_hdm
-ldflags-y += --wrap=devm_cxl_enable_hdm
ldflags-y += --wrap=devm_cxl_add_passthrough_decoder
ldflags-y += --wrap=devm_cxl_enumerate_decoders
ldflags-y += --wrap=cxl_await_media_ready
diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c
index 284416527644..de3933a776fd 100644
--- a/tools/testing/cxl/test/mock.c
+++ b/tools/testing/cxl/test/mock.c
@@ -149,21 +149,6 @@ struct cxl_hdm *__wrap_devm_cxl_setup_hdm(struct cxl_port *port,
}
EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_setup_hdm, CXL);
-int __wrap_devm_cxl_enable_hdm(struct cxl_port *port, struct cxl_hdm *cxlhdm)
-{
- int index, rc;
- struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
-
- if (ops && ops->is_mock_port(port->uport))
- rc = 0;
- else
- rc = devm_cxl_enable_hdm(port, cxlhdm);
- put_cxl_mock_ops(index);
-
- return rc;
-}
-EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_enable_hdm, CXL);
-
int __wrap_devm_cxl_add_passthrough_decoder(struct cxl_port *port)
{
int rc, index;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x b631e3a4e94c77c9007d60b577a069c203ce9594
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062206-muzzle-pope-0802@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b631e3a4e94c77c9007d60b577a069c203ce9594 Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Date: Thu, 8 Jun 2023 18:38:53 +0200
Subject: [PATCH] selftests: mptcp: sockopt: skip TCP_INQ checks if not
supported
Selftests are supposed to run on any kernel, including old ones that do
not support all MPTCP features.
One of them is TCP_INQ cmsg support introduced in commit 2c9e77659a0c
("mptcp: add TCP_INQ cmsg support").
It is possible to look for "mptcp_ioctl" in kallsyms because it was
needed to introduce the mentioned feature. We can skip these tests and
avoid setting the TCPINQ option if the feature is not supported.
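A kallsyms lookup of that kind can be sketched as follows (assumed
helper body; the selftests use mptcp_lib_kallsyms_has from
mptcp_lib.sh):

	# sketch: /proc/kallsyms lines are "<addr> <type> <name>", so a
	# space-prefixed, end-anchored pattern matches exactly one symbol
	mptcp_lib_kallsyms_has() {
		grep -q " ${1}" /proc/kallsyms
	}

	if ! mptcp_lib_kallsyms_has "mptcp_ioctl$"; then
		echo "INFO: TCP_INQ not supported: SKIP"
	fi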
Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368
Fixes: 5cbd886ce2a9 ("selftests: mptcp: add TCP_INQ support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
index 1d4ae8792227..f295a371ff14 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
@@ -187,9 +187,14 @@ do_transfer()
local_addr="0.0.0.0"
fi
+ cmsg="TIMESTAMPNS"
+ if mptcp_lib_kallsyms_has "mptcp_ioctl$"; then
+ cmsg+=",TCPINQ"
+ fi
+
timeout ${timeout_test} \
ip netns exec ${listener_ns} \
- $mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} -c TIMESTAMPNS,TCPINQ \
+ $mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} -c "${cmsg}" \
${local_addr} < "$sin" > "$sout" &
local spid=$!
@@ -197,7 +202,7 @@ do_transfer()
timeout ${timeout_test} \
ip netns exec ${connector_ns} \
- $mptcp_connect -t ${timeout_poll} -M 2 -p $port -s ${cl_proto} -c TIMESTAMPNS,TCPINQ \
+ $mptcp_connect -t ${timeout_poll} -M 2 -p $port -s ${cl_proto} -c "${cmsg}" \
$connect_addr < "$cin" > "$cout" &
local cpid=$!
@@ -313,6 +318,11 @@ do_tcpinq_tests()
{
local lret=0
+ if ! mptcp_lib_kallsyms_has "mptcp_ioctl$"; then
+ echo "INFO: TCP_INQ not supported: SKIP"
+ return
+ fi
+
local args
for args in "-t tcp" "-r tcp"; do
do_tcpinq_test $args
Sometimes I miss the stable announcements cos of the delights of our
corporate email setup, so add me to the 0th mail CC list.
Signed-off-by: Conor Dooley <conor.dooley(a)microchip.com>
---
I dunno how to test this, but touch wood I've not made a hames of
something trivial...
scripts/quilt-mail | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/scripts/quilt-mail b/scripts/quilt-mail
index 5eb70af702..7b77e2970d 100755
--- a/scripts/quilt-mail
+++ b/scripts/quilt-mail
@@ -177,7 +177,8 @@ CC_NAMES=("linux-kernel(a)vger\.kernel\.org"
"f\.fainelli(a)gmail\.com"
"sudipm\.mukherjee(a)gmail\.com"
"srw(a)sladewatkins\.net"
- "rwarsow(a)gmx\.de")
+ "rwarsow(a)gmx\.de"
+ "conor(a)kernel\.org")
#CC_LIST="stable(a)vger\.kernel\.org"
CC_LIST="patches(a)lists.linux.dev"
--
2.40.1
Hello!
We have built a tool for automatic data collection from production stations that can improve your work and increase the efficiency of your processes.
The solution installs quickly and is easy to use, and it provides you with valuable data on performance indicators for the whole plant and for individual machines. The tool clearly shows when a machine/line is slowing down and what downtime it has, so you know when to react.
You gain valuable information about the state of production and the reasons for possible delays. Thanks to the ability to react immediately to emerging problems and their symptoms, you increase the efficiency of your work by several tens of percent.
Would you like to test such a solution in your company free of charge?
Regards
Michal Rmoutil
commit d8e45bf1aed2 upstream.
(selftests/mount_setattr: fix redefine struct mount_attr build error)
Backport this commit from v6.2.0-rc5 to v6.3, v6.1, and v5.15 to resolve
the struct redefinition error:
mount_setattr_test.c:107:8: error: redefinition of 'struct mount_attr'
107 | struct mount_attr {
| ^~~~~~~~~~
In file included from /usr/include/x86_64-linux-gnu/sys/mount.h:32,
from mount_setattr_test.c:10:
../../../../usr/include/linux/mount.h:129:8: note: originally defined here
129 | struct mount_attr {
This error is caused by the upstream commit f1594bc67657
(selftests mount: Fix mount_setattr_test builds failed) backported
to v5.15
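(To reproduce the error on an affected tree, the usual kselftest
invocation should do; a sketch:)

	make -C tools/testing/selftests TARGETS=mount_setattr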
Thanks,
Hardik
----- Forwarded message from "Michael S. Tsirkin" <mst(a)redhat.com> -----
From: "Michael S. Tsirkin" <mst(a)redhat.com>
Date: Fri, 9 Jun 2023 03:27:28 -0400
To: linux-kernel(a)vger.kernel.org
Cc: kernel test robot <lkp(a)intel.com>, Suwan Kim <suwan.kim027(a)gmail.com>, "Roberts, Martin" <martin.roberts(a)intel.com>, Jason Wang
<jasowang(a)redhat.com>, Paolo Bonzini <pbonzini(a)redhat.com>, Stefan Hajnoczi <stefanha(a)redhat.com>, Xuan Zhuo
<xuanzhuo(a)linux.alibaba.com>, Jens Axboe <axboe(a)kernel.dk>, virtualization(a)lists.linux-foundation.org,
linux-block(a)vger.kernel.org
Subject: [PATCH v2] Revert "virtio-blk: support completion batching for the IRQ path"
Message-ID: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst(a)redhat.com>
This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13.
This change appears to have broken things...
We now see applications hanging during disk accesses.
e.g.
multi-port virtio-blk device running in h/w (FPGA)
Host running a simple 'fio' test.
[global]
thread=1
direct=1
ioengine=libaio
norandommap=1
group_reporting=1
bs=4K
rw=read
iodepth=128
runtime=1
numjobs=4
time_based
[job0]
filename=/dev/vda
[job1]
filename=/dev/vdb
[job2]
filename=/dev/vdc
...
[job15]
filename=/dev/vdp
i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads
This is repeatedly run in a loop.
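(i.e. something of this shape; a sketch, the job file name is assumed:)

	# rerun the job until it hangs; on the failure, fio never returns
	while fio ./vblk-16disks.fio; do :; done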
After a few, normally <10 seconds, fio hangs.
With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~1 hour before hanging.
Last message:
fio-3.19
Starting 8 threads
Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s]
I think this means at the end of the run 1 queue was left incomplete.
'diskstats' (run while fio is hung) shows no outstanding transactions.
e.g.
$ cat /proc/diskstats
...
252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0
252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0
...
Other stats (in the h/w, and added to the virtio-blk driver: [a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called.
e.g.
PF= 0 vq=0 1 2 3
[a]request_count - 839416590 813148916 105586179 84988123
[b]completion1_count - 839416590 813148916 105586179 84988123
[c]completion2_count - 0 0 0 0
PF= 1 vq=0 1 2 3
[a]request_count - 823335887 812516140 104582672 75856549
[b]completion1_count - 823335887 812516140 104582672 75856549
[c]completion2_count - 0 0 0 0
i.e. the issue is after the virtio-blk driver.
This change was introduced in kernel 6.3.0.
I am seeing this using 6.3.3.
If I run with an earlier kernel (5.15), it does not occur.
If I make a simple patch to the 6.3.3 virtio-blk driver to skip the blk_mq_add_to_batch() call, it does not fail.
e.g.
kernel 5.15 - this is OK
virtio_blk.c, virtblk_done() [irq handler]

	if (likely(!blk_should_fake_timeout(req->q))) {
		blk_mq_complete_request(req);
	}

kernel 6.3.3 - this fails
virtio_blk.c, virtblk_handle_req() [irq handler]

	if (likely(!blk_should_fake_timeout(req->q))) {
		if (!blk_mq_complete_request_remote(req)) {
			if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr),
						 virtblk_complete_batch)) {
				// this never gets called... so blk_mq_add_to_batch() must always succeed
				virtblk_request_done(req);
			}
		}
	}

If I do, kernel 6.3.3 - this is OK
virtio_blk.c, virtblk_handle_req() [irq handler]

	if (likely(!blk_should_fake_timeout(req->q))) {
		if (!blk_mq_complete_request_remote(req)) {
			virtblk_request_done(req); // force this here...
			if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr),
						 virtblk_complete_batch)) {
				// this never gets called... so blk_mq_add_to_batch() must always succeed
				virtblk_request_done(req);
			}
		}
	}
Perhaps you might like to fix/test/revert this change...
Martin
Reported-by: kernel test robot <lkp(a)intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/
Cc: Suwan Kim <suwan.kim027(a)gmail.com>
Reported-by: "Roberts, Martin" <martin.roberts(a)intel.com>
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
---
Since v1:
fix build error
Still completely untested as I'm traveling.
Martin, Suwan, could you please test and report?
Suwan if you have a better revert in mind pls post and
I will be happy to drop this.
Thanks!
drivers/block/virtio_blk.c | 82 +++++++++++++++++---------------------
1 file changed, 37 insertions(+), 45 deletions(-)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2b918e28acaa..b47358da92a2 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -348,63 +348,33 @@ static inline void virtblk_request_done(struct request *req)
blk_mq_end_request(req, status);
}
-static void virtblk_complete_batch(struct io_comp_batch *iob)
-{
- struct request *req;
-
- rq_list_for_each(&iob->req_list, req) {
- virtblk_unmap_data(req, blk_mq_rq_to_pdu(req));
- virtblk_cleanup_cmd(req);
- }
- blk_mq_end_request_batch(iob);
-}
-
-static int virtblk_handle_req(struct virtio_blk_vq *vq,
- struct io_comp_batch *iob)
-{
- struct virtblk_req *vbr;
- int req_done = 0;
- unsigned int len;
-
- while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) {
- struct request *req = blk_mq_rq_from_pdu(vbr);
-
- if (likely(!blk_should_fake_timeout(req->q)) &&
- !blk_mq_complete_request_remote(req) &&
- !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr),
- virtblk_complete_batch))
- virtblk_request_done(req);
- req_done++;
- }
-
- return req_done;
-}
-
static void virtblk_done(struct virtqueue *vq)
{
struct virtio_blk *vblk = vq->vdev->priv;
- struct virtio_blk_vq *vblk_vq = &vblk->vqs[vq->index];
- int req_done = 0;
+ bool req_done = false;
+ int qid = vq->index;
+ struct virtblk_req *vbr;
unsigned long flags;
- DEFINE_IO_COMP_BATCH(iob);
+ unsigned int len;
- spin_lock_irqsave(&vblk_vq->lock, flags);
+ spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
do {
virtqueue_disable_cb(vq);
- req_done += virtblk_handle_req(vblk_vq, &iob);
+ while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
+ struct request *req = blk_mq_rq_from_pdu(vbr);
+ if (likely(!blk_should_fake_timeout(req->q)))
+ blk_mq_complete_request(req);
+ req_done = true;
+ }
if (unlikely(virtqueue_is_broken(vq)))
break;
} while (!virtqueue_enable_cb(vq));
- if (req_done) {
- if (!rq_list_empty(iob.req_list))
- iob.complete(&iob);
-
- /* In case queue is stopped waiting for more buffers. */
+ /* In case queue is stopped waiting for more buffers. */
+ if (req_done)
blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
- }
- spin_unlock_irqrestore(&vblk_vq->lock, flags);
+ spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
}
static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
@@ -1283,15 +1253,37 @@ static void virtblk_map_queues(struct blk_mq_tag_set *set)
}
}
+static void virtblk_complete_batch(struct io_comp_batch *iob)
+{
+ struct request *req;
+
+ rq_list_for_each(&iob->req_list, req) {
+ virtblk_unmap_data(req, blk_mq_rq_to_pdu(req));
+ virtblk_cleanup_cmd(req);
+ }
+ blk_mq_end_request_batch(iob);
+}
+
static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
struct virtio_blk *vblk = hctx->queue->queuedata;
struct virtio_blk_vq *vq = get_virtio_blk_vq(hctx);
+ struct virtblk_req *vbr;
unsigned long flags;
+ unsigned int len;
int found = 0;
spin_lock_irqsave(&vq->lock, flags);
- found = virtblk_handle_req(vq, iob);
+
+ while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) {
+ struct request *req = blk_mq_rq_from_pdu(vbr);
+
+ found++;
+ if (!blk_mq_complete_request_remote(req) &&
+ !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr),
+ virtblk_complete_batch))
+ virtblk_request_done(req);
+ }
if (found)
blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
--
MST
----- End forwarded message -----
Patch 1 correctly handles disconnect() failures that can happen in some
specific cases: now the socket state is set as unconnected as expected.
That fixes an issue introduced in v6.2.
Patch 2 fixes a divide by zero bug in mptcp_recvmsg() with a fix similar
to a recent one from Eric Dumazet for TCP introducing sk_wait_pending
flag. It should address an issue present in MPTCP from almost the
beginning, from v5.9.
Patch 3 fixes a possible list corruption on passive MPJ; even if the race
seems very unlikely, better safe than sorry. The possible issue is
present from v5.17.
Patch 4 consolidates the fallback and non-fallback state machines to
avoid leaking some MPTCP sockets. The fix is likely needed for versions
from v5.11.
Patch 5 drops code that is no longer used after the introduction of
patch 4/6. This is not really a fix, but this patch can probably land in
the -net tree as well, so as not to leave unused code behind.
Patch 6 ensures listeners are unhashed before updating their sk status,
to avoid possible deadlocks when diag info is going to be retrieved
with a lock. Even if it should not be visible with the way we currently
get diag info, the issue is present from v5.17.
Signed-off-by: Matthieu Baerts <matthieu.baerts(a)tessares.net>
---
Paolo Abeni (6):
mptcp: handle correctly disconnect() failures
mptcp: fix possible divide by zero in recvmsg()
mptcp: fix possible list corruption on passive MPJ
mptcp: consolidate fallback and non fallback state machine
mptcp: drop legacy code around RX EOF
mptcp: ensure listener is unhashed before updating the sk status
net/mptcp/pm_netlink.c | 1 +
net/mptcp/protocol.c | 160 ++++++++++++++++++++-----------------------------
net/mptcp/protocol.h | 5 +-
net/mptcp/subflow.c | 17 +++---
4 files changed, 76 insertions(+), 107 deletions(-)
---
base-commit: 9a43827e876c9a071826cc81783aa2222b020f1d
change-id: 20230620-upstream-net-20230620-misc-fixes-for-v6-4-55ef43802324
Best regards,
--
Matthieu Baerts <matthieu.baerts(a)tessares.net>
Currently, it is possible for us to access memory that we shouldn't,
since we acquire (possibly dangling) pointers to dirty rectangles
before doing a bounds check to make sure we can actually accommodate the
number of dirty rectangles userspace has requested to fill.
is especially evident if a compositor requests both MPO and damage clips
at the same time, in which case I have observed a soft-hang. So, to
avoid this issue, perform the bounds check before filling a single dirty
rectangle and WARN() about it, if it is ever attempted in
fill_dc_dirty_rect().
Cc: stable(a)vger.kernel.org # 6.1+
Fixes: 30ebe41582d1 ("drm/amd/display: add FB_DAMAGE_CLIPS support")
Signed-off-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
---
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 13 ++++---------
1 file changed, 4 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 64b8dcf8dbda..66bb03d503ea 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -5065,11 +5065,7 @@ static inline void fill_dc_dirty_rect(struct drm_plane *plane,
s32 y, s32 width, s32 height,
int *i, bool ffu)
{
- if (*i > DC_MAX_DIRTY_RECTS)
- return;
-
- if (*i == DC_MAX_DIRTY_RECTS)
- goto out;
+ WARN_ON(*i >= DC_MAX_DIRTY_RECTS);
dirty_rect->x = x;
dirty_rect->y = y;
@@ -5085,7 +5081,6 @@ static inline void fill_dc_dirty_rect(struct drm_plane *plane,
"[PLANE:%d] PSR SU dirty rect at (%d, %d) size (%d, %d)",
plane->base.id, x, y, width, height);
-out:
(*i)++;
}
@@ -5172,6 +5167,9 @@ static void fill_dc_dirty_rects(struct drm_plane *plane,
*dirty_regions_changed = bb_changed;
+ if ((num_clips + (bb_changed ? 2 : 0)) > DC_MAX_DIRTY_RECTS)
+ goto ffu;
+
if (bb_changed) {
fill_dc_dirty_rect(new_plane_state->plane, &dirty_rects[i],
new_plane_state->crtc_x,
@@ -5201,9 +5199,6 @@ static void fill_dc_dirty_rects(struct drm_plane *plane,
new_plane_state->crtc_h, &i, false);
}
- if (i > DC_MAX_DIRTY_RECTS)
- goto ffu;
-
flip_addrs->dirty_rect_count = i;
return;
--
2.40.1
The restoring of TPIDR2 signal context has been broken since it was
merged; fix this and add a test case covering it. This is a result of
TPIDR2 context management following a different flow to any of the other
state that we provide and the fact that we don't expose TPIDR (which
follows the same pattern) to signals.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Mark Brown (2):
arm64/signal: Restore TPIDR2 register rather than memory state
kselftest/arm64: Add a test case for TPIDR2 restore
arch/arm64/kernel/signal.c | 2 +-
tools/testing/selftests/arm64/signal/.gitignore | 2 +-
.../arm64/signal/testcases/tpidr2_restore.c | 85 ++++++++++++++++++++++
3 files changed, 87 insertions(+), 2 deletions(-)
---
base-commit: 858fd168a95c5b9669aac8db6c14a9aeab446375
change-id: 20230621-arm64-fix-tpidr2-signal-restore-713d93798f99
Best regards,
--
Mark Brown <broonie(a)kernel.org>
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 306320034e8fbe7ee1cc4f5269c55658b4612048
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061830-rubbed-stubble-2775@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 306320034e8fbe7ee1cc4f5269c55658b4612048 Mon Sep 17 00:00:00 2001
From: Bernhard Seibold <mail(a)bernhard-seibold.de>
Date: Fri, 2 Jun 2023 15:30:29 +0200
Subject: [PATCH] serial: lantiq: add missing interrupt ack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Currently, the error interrupt is never acknowledged, so once active it
will stay active indefinitely, causing the handler to be called in an
infinite loop.
Fixes: 2f0fc4159a6a ("SERIAL: Lantiq: Add driver for MIPS Lantiq SOCs.")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Bernhard Seibold <mail(a)bernhard-seibold.de>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Message-ID: <20230602133029.546-1-mail(a)bernhard-seibold.de>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/lantiq.c b/drivers/tty/serial/lantiq.c
index a58e9277dfad..f1387f1024db 100644
--- a/drivers/tty/serial/lantiq.c
+++ b/drivers/tty/serial/lantiq.c
@@ -250,6 +250,7 @@ lqasc_err_int(int irq, void *_port)
struct ltq_uart_port *ltq_port = to_ltq_uart_port(port);
spin_lock_irqsave(<q_port->lock, flags);
+ __raw_writel(ASC_IRNCR_EIR, port->membase + LTQ_ASC_IRNCR);
/* clear any pending interrupts */
asc_update_bits(0, ASCWHBSTATE_CLRPE | ASCWHBSTATE_CLRFE |
ASCWHBSTATE_CLRROE, port->membase + LTQ_ASC_WHBSTATE);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 5005bcb4219156f1bf7587b185080ec1da08518e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062114-jet-underwire-9543@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5005bcb4219156f1bf7587b185080ec1da08518e Mon Sep 17 00:00:00 2001
From: Namjae Jeon <linkinjeon(a)kernel.org>
Date: Thu, 15 Jun 2023 22:05:29 +0900
Subject: [PATCH] ksmbd: validate session id and tree id in the compound
request
This patch validates the session id and tree id in a compound request.
If the first operation in the compound is an SMB2 ECHO request, ksmbd
bypasses session and tree validation, so work->sess and work->tcon could
be NULL. If the second request in the compound accesses work->sess or
work->tcon, it causes a NULL pointer dereference error.
Cc: stable(a)vger.kernel.org
Reported-by: zdi-disclosures(a)trendmicro.com # ZDI-CAN-21165
Signed-off-by: Namjae Jeon <linkinjeon(a)kernel.org>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index f9b2e0f19b03..ced7a9e916f0 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -185,24 +185,31 @@ static void __handle_ksmbd_work(struct ksmbd_work *work,
goto send;
}
- if (conn->ops->check_user_session) {
- rc = conn->ops->check_user_session(work);
- if (rc < 0) {
- command = conn->ops->get_cmd_val(work);
- conn->ops->set_rsp_status(work,
- STATUS_USER_SESSION_DELETED);
- goto send;
- } else if (rc > 0) {
- rc = conn->ops->get_ksmbd_tcon(work);
+ do {
+ if (conn->ops->check_user_session) {
+ rc = conn->ops->check_user_session(work);
if (rc < 0) {
- conn->ops->set_rsp_status(work,
- STATUS_NETWORK_NAME_DELETED);
+ if (rc == -EINVAL)
+ conn->ops->set_rsp_status(work,
+ STATUS_INVALID_PARAMETER);
+ else
+ conn->ops->set_rsp_status(work,
+ STATUS_USER_SESSION_DELETED);
goto send;
+ } else if (rc > 0) {
+ rc = conn->ops->get_ksmbd_tcon(work);
+ if (rc < 0) {
+ if (rc == -EINVAL)
+ conn->ops->set_rsp_status(work,
+ STATUS_INVALID_PARAMETER);
+ else
+ conn->ops->set_rsp_status(work,
+ STATUS_NETWORK_NAME_DELETED);
+ goto send;
+ }
}
}
- }
- do {
rc = __process_request(work, conn, &command);
if (rc == SERVER_HANDLER_ABORT)
break;
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index ccecdb71d2bc..da1787c68ba0 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -91,7 +91,6 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work)
unsigned int cmd = le16_to_cpu(req_hdr->Command);
int tree_id;
- work->tcon = NULL;
if (cmd == SMB2_TREE_CONNECT_HE ||
cmd == SMB2_CANCEL_HE ||
cmd == SMB2_LOGOFF_HE) {
@@ -105,10 +104,28 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work)
}
tree_id = le32_to_cpu(req_hdr->Id.SyncId.TreeId);
+
+ /*
+ * If request is not the first in Compound request,
+ * Just validate tree id in header with work->tcon->id.
+ */
+ if (work->next_smb2_rcv_hdr_off) {
+ if (!work->tcon) {
+ pr_err("The first operation in the compound does not have tcon\n");
+ return -EINVAL;
+ }
+ if (work->tcon->id != tree_id) {
+ pr_err("tree id(%u) is different with id(%u) in first operation\n",
+ tree_id, work->tcon->id);
+ return -EINVAL;
+ }
+ return 1;
+ }
+
work->tcon = ksmbd_tree_conn_lookup(work->sess, tree_id);
if (!work->tcon) {
pr_err("Invalid tid %d\n", tree_id);
- return -EINVAL;
+ return -ENOENT;
}
return 1;
@@ -547,7 +564,6 @@ int smb2_check_user_session(struct ksmbd_work *work)
unsigned int cmd = conn->ops->get_cmd_val(work);
unsigned long long sess_id;
- work->sess = NULL;
/*
* SMB2_ECHO, SMB2_NEGOTIATE, SMB2_SESSION_SETUP command do not
* require a session id, so no need to validate user session's for
@@ -558,15 +574,33 @@ int smb2_check_user_session(struct ksmbd_work *work)
return 0;
if (!ksmbd_conn_good(conn))
- return -EINVAL;
+ return -EIO;
sess_id = le64_to_cpu(req_hdr->SessionId);
+
+ /*
+ * If request is not the first in Compound request,
+ * Just validate session id in header with work->sess->id.
+ */
+ if (work->next_smb2_rcv_hdr_off) {
+ if (!work->sess) {
+ pr_err("The first operation in the compound does not have sess\n");
+ return -EINVAL;
+ }
+ if (work->sess->id != sess_id) {
+ pr_err("session id(%llu) is different with the first operation(%lld)\n",
+ sess_id, work->sess->id);
+ return -EINVAL;
+ }
+ return 1;
+ }
+
/* Check for validity of user session */
work->sess = ksmbd_session_lookup_all(conn, sess_id);
if (work->sess)
return 1;
ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id);
- return -EINVAL;
+ return -ENOENT;
}
static void destroy_previous_session(struct ksmbd_conn *conn,
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 5fe7f7b78290638806211046a99f031ff26164e1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062136-xbox-tidal-465b@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5fe7f7b78290638806211046a99f031ff26164e1 Mon Sep 17 00:00:00 2001
From: Namjae Jeon <linkinjeon(a)kernel.org>
Date: Thu, 15 Jun 2023 22:04:40 +0900
Subject: [PATCH] ksmbd: fix out-of-bound read in smb2_write
ksmbd_smb2_check_message doesn't validate hdr->NextCommand. If
->NextCommand is bigger than Offset + Length of the smb2 write, it will
allow an oversized smb2 write length, which causes an OOB read in
smb2_write.
Cc: stable(a)vger.kernel.org
Reported-by: zdi-disclosures(a)trendmicro.com # ZDI-CAN-21164
Signed-off-by: Namjae Jeon <linkinjeon(a)kernel.org>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index 57749f41b991..33b7e6c4ceff 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -351,10 +351,16 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
int command;
__u32 clc_len; /* calculated length */
__u32 len = get_rfc1002_len(work->request_buf);
- __u32 req_struct_size;
+ __u32 req_struct_size, next_cmd = le32_to_cpu(hdr->NextCommand);
- if (le32_to_cpu(hdr->NextCommand) > 0)
- len = le32_to_cpu(hdr->NextCommand);
+ if ((u64)work->next_smb2_rcv_hdr_off + next_cmd > len) {
+ pr_err("next command(%u) offset exceeds smb msg size\n",
+ next_cmd);
+ return 1;
+ }
+
+ if (next_cmd > 0)
+ len = next_cmd;
else if (work->next_smb2_rcv_hdr_off)
len -= work->next_smb2_rcv_hdr_off;
Hi,
ea2062dd1f03 ("drm/amd/display: fix the system hang while disable PSR")
was tagged for stable, but failed to apply to 6.3.y, 6.1.y and 5.15.y.
I've looked into the missing dependencies, and here are the dependencies
needed for the stable backport:
5.15.y:
-------
97ca308925a5 ("drm/amd/display: Add minimal pipe split transition state")
f7511289821f ("drm/amd/display: Use dc_update_planes_and_stream")
81f743a08f3b ("drm/amd/display: Add wrapper to call planes and stream update")
ea2062dd1f03 ("drm/amd/display: fix the system hang while disable PSR")
6.1.y / 6.3.y
-------------
ea2062dd1f03 ("drm/amd/display: fix the system hang while disable PSR")
f7511289821f ("drm/amd/display: Use dc_update_planes_and_stream")
81f743a08f3b ("drm/amd/display: Add wrapper to call planes and stream update")
ea2062dd1f03 ("drm/amd/display: fix the system hang while disable PSR")
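(In cherry-pick form, for 5.15.y that would be roughly, a sketch using
the IDs listed above:

	git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
	git checkout FETCH_HEAD
	git cherry-pick -x 97ca308925a5 f7511289821f 81f743a08f3b ea2062dd1f03

For 6.1.y/6.3.y, the same sequence without 97ca308925a5.)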
Thanks!
Hi,
This patch fixes a deadlock that can render a system frozen when
reading user memory from BPF.
Ideally, it should be applied to any supported revision equal to or
greater than 5.19.
patch subject: mm: Fix copy_from_user_nofault().
git revision: d319f344561de23e810515d109c7278919bff7b0
Thanks,
--
Fco. Javier Honduvilla Coto
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
commit e18eb8783ec4949adebc7d7b0fdb65f65bfeefd9 upstream.
Currently the tracing_reset_all_online_cpus() requires the
trace_types_lock held. But only one caller of this function actually has
that lock held before calling it, and the other just takes the lock so
that it can call it. More users of this function is needed where the lock
is not held.
Add a tracing_reset_all_online_cpus_unlocked() function for the one use
case that calls it without being held, and also add a lockdep_assert to
make sure it is held when called.
Then have tracing_reset_all_online_cpus() take the lock internally, such
that callers do not need to worry about taking it.
Link: https://lkml.kernel.org/r/20221123192741.658273220@goodmis.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Zheng Yejian <zhengyejian1(a)huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
[Refers to the commit message of 417d5ea6e735e5d88ffb6c436cf2938f3f476dd1:
this patch is a prerequisite, and tracing_reset_all_online_cpus() should
be called with trace_types_lock held, as its comment describes.]
Fixes: 417d5ea6e735 ("tracing: Free buffers when a used dynamic event is removed")
Signed-off-by: Zheng Yejian <zhengyejian1(a)huawei.com>
---
kernel/trace/trace.c | 11 ++++++++++-
kernel/trace/trace.h | 1 +
kernel/trace/trace_events.c | 2 +-
kernel/trace/trace_events_synth.c | 2 --
4 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8b5abd6e36c..0202f23ae960 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2175,10 +2175,12 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
}
/* Must have trace_types_lock held */
-void tracing_reset_all_online_cpus(void)
+void tracing_reset_all_online_cpus_unlocked(void)
{
struct trace_array *tr;
+ lockdep_assert_held(&trace_types_lock);
+
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (!tr->clear_trace)
continue;
@@ -2190,6 +2192,13 @@ void tracing_reset_all_online_cpus(void)
}
}
+void tracing_reset_all_online_cpus(void)
+{
+ mutex_lock(&trace_types_lock);
+ tracing_reset_all_online_cpus_unlocked();
+ mutex_unlock(&trace_types_lock);
+}
+
/*
* The tgid_map array maps from pid to tgid; i.e. the value stored at index i
* is the tgid last observed corresponding to pid=i.
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 66b6c8395fbc..2c3d9b6ce148 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -580,6 +580,7 @@ int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct array_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
+void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1aadc9a6487b..160298d285c0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2974,7 +2974,7 @@ static void trace_module_remove_events(struct module *mod)
* over from this module may be passed to the new module events and
* unexpected results may occur.
*/
- tracing_reset_all_online_cpus();
+ tracing_reset_all_online_cpus_unlocked();
}
static int trace_module_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 2fdf3fd591e1..08c7df42ade7 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1416,7 +1416,6 @@ int synth_event_delete(const char *event_name)
mutex_unlock(&event_mutex);
if (mod) {
- mutex_lock(&trace_types_lock);
/*
* It is safest to reset the ring buffer if the module
* being unloaded registered any events that were
@@ -1428,7 +1427,6 @@ int synth_event_delete(const char *event_name)
* occur.
*/
tracing_reset_all_online_cpus();
- mutex_unlock(&trace_types_lock);
}
return ret;
--
2.25.1
Please backport the following changes to the 4.19 stable kernel:
e1891cffd4c4 "ipmi: Make the smi watcher be disabled immediately when not needed"
383035211c79 "ipmi: move message error checking to avoid deadlock"
e1891cffd4c4 doesn't apply completely cleanly because of other changes,
but you just need to leave in the free_user_work() function and delete
the other function in the conflict. I can also supply a patch if
necessary.
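(In command form, the 4.19.y sequence would be roughly, a sketch:

	git cherry-pick -x e1891cffd4c4
	# conflict: keep free_user_work(), delete the other function
	git cherry-pick -x 383035211c79
)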
Change
b4a34aa6d "ipmi: Fix how the lower layers are told to watch for messages"
was backported to fulfill a dependency for another backport, but there
was another change:
e1891cffd4c4 "ipmi: Make the smi watcher be disabled immediately when not needed"
That is needed to avoid calling a lower layer function with
xmit_msgs_lock held. In addition to that, you will also need:
383035211c79 "ipmi: move message error checking to avoid deadlock"
to fix a bug in that change.
e1891cffd4c4 came in 5.1 and 383035211c79 came in 5.4 (and I believe was
backported) so everything should be good for 5.4 and later. b4a34aa6d
was not backported to 4.14, so it is also ok. So 4.19 is the only
kernel that needs the change.
Thanks to Janne Huttunen for quick work on this.
-corey
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 97b6b9cbba40a21c1d9a344d5c1991f8cfbf136e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061700-surplus-art-1fef@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 97b6b9cbba40a21c1d9a344d5c1991f8cfbf136e Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda(a)chromium.org>
Date: Fri, 19 May 2023 16:47:37 +0200
Subject: [PATCH] x86/purgatory: remove PGO flags
If profile-guided optimization is enabled, the purgatory ends up with
multiple .text sections. This is not supported by kexec and crashes the
system.
Link: https://lkml.kernel.org/r/20230321-kexec_clang16-v7-2-b05c520b7296@chromium…
Fixes: 930457057abe ("kernel/kexec_file.c: split up __kexec_load_puragory")
Signed-off-by: Ricardo Ribalda <ribalda(a)chromium.org>
Cc: <stable(a)vger.kernel.org>
Cc: Albert Ou <aou(a)eecs.berkeley.edu>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Eric W. Biederman <ebiederm(a)xmission.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Nathan Chancellor <nathan(a)kernel.org>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Nick Desaulniers <ndesaulniers(a)google.com>
Cc: Palmer Dabbelt <palmer(a)dabbelt.com>
Cc: Palmer Dabbelt <palmer(a)rivosinc.com>
Cc: Paul Walmsley <paul.walmsley(a)sifive.com>
Cc: Philipp Rudo <prudo(a)redhat.com>
Cc: Ross Zwisler <zwisler(a)google.com>
Cc: Simon Horman <horms(a)kernel.org>
Cc: Steven Rostedt (Google) <rostedt(a)goodmis.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Tom Rix <trix(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 82fec66d46d2..42abd6af1198 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -14,6 +14,11 @@ $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
CFLAGS_sha256.o := -D__DISABLE_EXPORTS
+# When profile-guided optimization is enabled, llvm emits two different
+# overlapping text sections, which is not supported by kexec. Remove profile
+# optimization flags.
+KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%,$(KBUILD_CFLAGS))
+
# When linking purgatory.ro with -r unresolved symbols are not checked,
# also link a purgatory.chk binary without -r to check for unresolved symbols.
PURGATORY_LDFLAGS := -e purgatory_start -z nodefaultlib
commit 92c5d1b860e9581d64baca76779576c0ab0d943d upstream.
The current sanity check for nilfs2 geometry information lacks checks for
the number of segments stored in superblocks, so even for device images
that have been destructively truncated or have an unusually high number of
segments, the mount operation may succeed.
This causes out-of-bounds block I/O on file system block reads or log
writes to the segments, the latter in particular causing
"a_ops->writepages" to repeatedly fail, resulting in sync_inodes_sb() to
hang.
Fix this issue by checking the number of segments stored in the superblock
and avoiding mounting devices that can cause out-of-bounds accesses. To
eliminate the possibility of overflow when calculating the number of
blocks required for the device from the number of segments, this also adds
a helper function to calculate the upper bound on the number of segments
and inserts a check using it.
Link: https://lkml.kernel.org/r/20230526021332.3431-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+7d50f1e54a12ba3aeae2(a)syzkaller.appspotmail.com
Link: https://syzkaller.appspot.com/bug?extid=7d50f1e54a12ba3aeae2
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
Please apply this patch to the above stable trees instead of the patch
that could not be applied to them. The hang issue reported by syzbot was
confirmed to reproduce on these stable kernels using its reproducer.
This fixes it.
In this patch, "sb_bdev_nr_blocks()" and "nilfs_err()" are replaced with
their equivalents since they don't yet exist in these kernels. With these
tweaks, this patch is applicable from v4.8 to v5.8. Also, this patch has
been tested against the title stable trees.
fs/nilfs2/the_nilfs.c | 44 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 43 insertions(+), 1 deletion(-)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 24f626e7d012..d550a564645e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -375,6 +375,18 @@ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs)
100));
}
+/**
+ * nilfs_max_segment_count - calculate the maximum number of segments
+ * @nilfs: nilfs object
+ */
+static u64 nilfs_max_segment_count(struct the_nilfs *nilfs)
+{
+ u64 max_count = U64_MAX;
+
+ do_div(max_count, nilfs->ns_blocks_per_segment);
+ return min_t(u64, max_count, ULONG_MAX);
+}
+
void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
{
nilfs->ns_nsegments = nsegs;
@@ -384,6 +396,8 @@ void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
struct nilfs_super_block *sbp)
{
+ u64 nsegments, nblocks;
+
if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
nilfs_msg(nilfs->ns_sb, KERN_ERR,
"unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).",
@@ -430,7 +444,35 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
return -EINVAL;
}
- nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments));
+ nsegments = le64_to_cpu(sbp->s_nsegments);
+ if (nsegments > nilfs_max_segment_count(nilfs)) {
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "segment count %llu exceeds upper limit (%llu segments)",
+ (unsigned long long)nsegments,
+ (unsigned long long)nilfs_max_segment_count(nilfs));
+ return -EINVAL;
+ }
+
+ nblocks = (u64)i_size_read(nilfs->ns_sb->s_bdev->bd_inode) >>
+ nilfs->ns_sb->s_blocksize_bits;
+ if (nblocks) {
+ u64 min_block_count = nsegments * nilfs->ns_blocks_per_segment;
+ /*
+ * To avoid failing to mount early device images without a
+ * second superblock, exclude that block count from the
+ * "min_block_count" calculation.
+ */
+
+ if (nblocks < min_block_count) {
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "total number of segment blocks %llu exceeds device size (%llu blocks)",
+ (unsigned long long)min_block_count,
+ (unsigned long long)nblocks);
+ return -EINVAL;
+ }
+ }
+
+ nilfs_set_nsegments(nilfs, nsegments);
nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
return 0;
}
--
2.39.3
If the core is left to remove the LEDs via devm_, it is performed too
late, after the PHY driver is removed from the PHY. This results in
dereferencing a NULL pointer when the LED core tries to turn the LED
off before destroying the LED.
Manually unregister the LEDs at a safe point in phy_remove.
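As a rough illustration of the ordering constraint (hypothetical
names, heavily simplified from the driver):
```
#include <stdio.h>

/* Minimal sketch of the ordering bug: the LED "off" callback
 * dereferences driver state, so every LED must be unregistered while
 * the driver is still bound. devm-style cleanup would only run after
 * the unbind at the end of main(). */
struct led { struct led *next; const char *name; };
struct phy { struct led *leds; int driver_bound; };

static void led_off(struct phy *phy, struct led *led)
{
	if (!phy->driver_bound)
		printf("BUG: %s turned off after driver removal\n", led->name);
}

static void phy_leds_unregister(struct phy *phy)
{
	/* the LED core turns each LED off as part of unregistering it */
	for (struct led *l = phy->leds; l; l = l->next)
		led_off(phy, l);
	phy->leds = NULL;
}

int main(void)
{
	struct led led = { NULL, "phy0:green" };
	struct phy phy = { &led, 1 };

	phy_leds_unregister(&phy);  /* safe: called from phy_remove() */
	phy.driver_bound = 0;       /* driver unbinds afterwards */
	return 0;
}
```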
Cc: stable(a)vger.kernel.org
Reported-by: Florian Fainelli <f.fainelli(a)gmail.com>
Suggested-by: Florian Fainelli <f.fainelli(a)gmail.com>
Fixes: 01e5b728e9e4 ("net: phy: Add a binding for PHY LEDs")
Signed-off-by: Andrew Lunn <andrew(a)lunn.ch>
---
drivers/net/phy/phy_device.c | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 17d0d0555a79..53598210be6c 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -3021,6 +3021,15 @@ static int phy_led_blink_set(struct led_classdev *led_cdev,
return err;
}
+static void phy_leds_unregister(struct phy_device *phydev)
+{
+ struct phy_led *phyled;
+
+ list_for_each_entry(phyled, &phydev->leds, list) {
+ led_classdev_unregister(&phyled->led_cdev);
+ }
+}
+
static int of_phy_led(struct phy_device *phydev,
struct device_node *led)
{
@@ -3054,7 +3063,7 @@ static int of_phy_led(struct phy_device *phydev,
init_data.fwnode = of_fwnode_handle(led);
init_data.devname_mandatory = true;
- err = devm_led_classdev_register_ext(dev, cdev, &init_data);
+ err = led_classdev_register_ext(dev, cdev, &init_data);
if (err)
return err;
@@ -3083,6 +3092,7 @@ static int of_phy_leds(struct phy_device *phydev)
err = of_phy_led(phydev, led);
if (err) {
of_node_put(led);
+ phy_leds_unregister(phydev);
return err;
}
}
@@ -3305,6 +3315,9 @@ static int phy_remove(struct device *dev)
cancel_delayed_work_sync(&phydev->state_queue);
+ if (IS_ENABLED(CONFIG_PHYLIB_LEDS))
+ phy_leds_unregister(phydev);
+
phydev->state = PHY_DOWN;
sfp_bus_del_upstream(phydev->sfp_bus);
--
2.40.1
Fix the test for the AST2200 in the DRAM initialization. The value
in ast->chip has to be compared against an enum constant instead of
a numerical value.
This bug was introduced when the driver was first imported into the
kernel.
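The failure mode is easy to demonstrate outside the driver: C happily
compares an enum against an unrelated integer literal, so a sketch
like the following (with made-up enum values, not the driver's actual
definitions) compiles without any diagnostic:
```
#include <stdio.h>

/* Hypothetical stand-ins for the driver's chip enum; the enumerator
 * AST2200 does not equal the literal 2200. */
enum ast_chip { AST2000, AST2100, AST2200 };

int main(void)
{
	enum ast_chip chip = AST2200;

	/* Compiles cleanly, but is never true: AST2200 == 2 here. */
	if (chip == 2200)
		printf("matched literal\n");
	else
		printf("literal 2200 never matches enum AST2200 (%d)\n", chip);
	return 0;
}
```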
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Fixes: 312fec1405dd ("drm: Initial KMS driver for AST (ASpeed Technologies) 2000 series (v2)")
Cc: Dave Airlie <airlied(a)redhat.com>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v3.5+
Reviewed-by: Sui Jingfeng <suijingfeng(a)loongson.cn>
Reviewed-by: Jocelyn Falempe <jfalempe(a)redhat.com>
Tested-by: Jocelyn Falempe <jfalempe(a)redhat.com> # AST2600
---
drivers/gpu/drm/ast/ast_post.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/ast/ast_post.c b/drivers/gpu/drm/ast/ast_post.c
index a005aec18a020..0262aaafdb1c5 100644
--- a/drivers/gpu/drm/ast/ast_post.c
+++ b/drivers/gpu/drm/ast/ast_post.c
@@ -291,7 +291,7 @@ static void ast_init_dram_reg(struct drm_device *dev)
;
} while (ast_read32(ast, 0x10100) != 0xa8);
} else {/* AST2100/1100 */
- if (ast->chip == AST2100 || ast->chip == 2200)
+ if (ast->chip == AST2100 || ast->chip == AST2200)
dram_reg_info = ast2100_dram_table_data;
else
dram_reg_info = ast1100_dram_table_data;
--
2.41.0
The 'qcom_swrm_ctrl->pconfig' has a size of QCOM_SDW_MAX_PORTS (14);
however, we index it starting from 1, not 0, to match real port
numbers. This can lead to writing the port config past 'pconfig'
bounds and overwriting the next member of the 'qcom_swrm_ctrl' struct.
Also reported by smatch:
drivers/soundwire/qcom.c:1269 qcom_swrm_get_port_config() error: buffer overflow 'ctrl->pconfig' 14 <= 14
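This is the textbook 1-based-indexing off-by-one; a minimal standalone
sketch (struct member simplified) of why the table needs the extra
slot:
```
#include <stdio.h>

#define QCOM_SDW_MAX_PORTS 14

/* Ports are numbered 1..14, so a table indexed by port number needs
 * QCOM_SDW_MAX_PORTS + 1 slots; with only 14 slots, writing
 * pconfig[14] lands one element past the end of the array. */
struct port_config { unsigned int si; };

int main(void)
{
	struct port_config pconfig[QCOM_SDW_MAX_PORTS + 1];

	for (int port = 1; port <= QCOM_SDW_MAX_PORTS; port++)
		pconfig[port].si = port;  /* in bounds only thanks to +1 */

	printf("slots: %zu\n", sizeof(pconfig) / sizeof(pconfig[0]));
	return 0;
}
```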
Fixes: 9916c02ccd74 ("soundwire: qcom: cleanup internal port config indexing")
Cc: <stable(a)vger.kernel.org>
Reported-by: kernel test robot <lkp(a)intel.com>
Reported-by: Dan Carpenter <error27(a)gmail.com>
Link: https://lore.kernel.org/r/202305201301.sCJ8UDKV-lkp@intel.com/
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
---
drivers/soundwire/qcom.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/soundwire/qcom.c b/drivers/soundwire/qcom.c
index 7cb1b7eba814..88a772075907 100644
--- a/drivers/soundwire/qcom.c
+++ b/drivers/soundwire/qcom.c
@@ -202,7 +202,8 @@ struct qcom_swrm_ctrl {
u32 intr_mask;
u8 rcmd_id;
u8 wcmd_id;
- struct qcom_swrm_port_config pconfig[QCOM_SDW_MAX_PORTS];
+ /* Port numbers are 1 - 14 */
+ struct qcom_swrm_port_config pconfig[QCOM_SDW_MAX_PORTS + 1];
struct sdw_stream_runtime *sruntime[SWRM_MAX_DAIS];
enum sdw_slave_status status[SDW_MAX_DEVICES + 1];
int (*reg_read)(struct qcom_swrm_ctrl *ctrl, int reg, u32 *val);
--
2.34.1
From: Jim Wylder <jwylder(a)google.com>
[ Upstream commit 3981514180c987a79ea98f0ae06a7cbf58a9ac0f ]
Currently, when regmap_raw_write() splits the data, it uses the
max_raw_write value defined for the bus. For any bus that includes
the target register address in the max_raw_write value, the chunked
transmission will always exceed the maximum transmission length.
To avoid this problem, subtract the length of the register and the
padding from the maximum transmission.
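A quick worked example of the chunking arithmetic, with made-up bus
parameters (not taken from any real device):
```
#include <stddef.h>
#include <stdio.h>

int main(void)
{
	size_t max_raw_write = 32;  /* bus limit, incl. addr + padding */
	size_t reg_bytes = 2, pad_bytes = 0, val_bytes = 2;

	size_t max_data = max_raw_write - reg_bytes - pad_bytes;
	size_t chunk_regs = max_data / val_bytes;

	/* old code: 32 / 2 = 16 regs -> 2 + 32 = 34 bytes on the wire */
	/* new code: 30 / 2 = 15 regs -> 2 + 30 = 32 bytes, within limit */
	printf("regs per chunk: %zu\n", chunk_regs);
	return 0;
}
```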
Signed-off-by: Jim Wylder <jwylder(a)google.com>
Link: https://lore.kernel.org/r/20230517152444.3690870-2-jwylder@google.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/base/regmap/regmap.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 55a30afc14a00..2a3c3dfefdcec 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -1998,6 +1998,8 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg,
size_t val_count = val_len / val_bytes;
size_t chunk_count, chunk_bytes;
size_t chunk_regs = val_count;
+ size_t max_data = map->max_raw_write - map->format.reg_bytes -
+ map->format.pad_bytes;
int ret, i;
if (!val_count)
@@ -2005,8 +2007,8 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg,
if (map->use_single_write)
chunk_regs = 1;
- else if (map->max_raw_write && val_len > map->max_raw_write)
- chunk_regs = map->max_raw_write / val_bytes;
+ else if (map->max_raw_write && val_len > max_data)
+ chunk_regs = max_data / val_bytes;
chunk_count = val_count / chunk_regs;
chunk_bytes = chunk_regs * val_bytes;
--
2.39.2
Hi,
Some (consumer) Phoenix laptops are showing up on the market that don't
have discrete TPMs and are choosing not to enable the AMD security
processor firmware TPM (fTPM).
These laptops offer Pluton instead, and rely upon it for TPM
functionality.
This was introduced in kernel 6.3 with:
4d2732882703 ("tpm_crb: Add support for CRB devices based on Pluton")
I double checked with a backport of this to 6.1.y and at least basic TPM
functionality does work properly.
Could this be brought back to 6.1.y so that users with these laptops
have TPM working with the LTS kernel?
Thanks,
The Amiga partition parser module uses signed int for partition sector
address and count, which will overflow for disks larger than 1 TB.
Use u64 as type for sector address and size to allow using disks up to
2 TB without LBD support, and disks larger than 2 TB with LBD. The RDB
format allows specifying disk sizes up to 2^128 bytes (though native
OS limitations reduce this somewhat, to at most 2^68 bytes), so check
for u64 overflow carefully to protect against overflowing sector_t.
Bail out if sector addresses overflow 32 bits on kernels without LBD
support.
This bug was reported originally in 2012, and the fix was created by
the RDB author, Joanne Dow <jdow(a)earthlink.net>. A patch had been
discussed and reviewed on linux-m68k at that time but never officially
submitted (now resubmitted as patch 1 in this series).
This patch adds additional error checking and warning messages.
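A minimal sketch of the checking strategy, using the compiler builtin
that the kernel's check_mul_overflow() is typically built on (the
values are deliberately absurd to force the overflow):
```
#include <stdint.h>
#include <stdio.h>

/* Three u32 fields (heads, sectors/track, cylinders) plus a block
 * size multiplier can together exceed u64, so each multiplication is
 * checked rather than only the final product. */
int main(void)
{
	uint32_t nr_hd = 0xffffffff, nr_sect = 0xffffffff, blksize = 16;
	uint32_t cylblk;

	if (__builtin_mul_overflow(nr_hd, nr_sect, &cylblk)) {
		printf("heads*sects overflows u32, skip partition\n");
		return 0;
	}
	if (__builtin_mul_overflow(cylblk, blksize, &cylblk)) {
		printf("bytes per cylinder overflows u32, skip partition\n");
		return 0;
	}
	printf("cylblk = %u\n", cylblk);
	return 0;
}
```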
Reported-by: Martin Steigerwald <Martin(a)lichtvoll.de>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=43511
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Message-ID: <201206192146.09327.Martin(a)lichtvoll.de>
Cc: <stable(a)vger.kernel.org> # 5.2
Signed-off-by: Michael Schmitz <schmitzmic(a)gmail.com>
Reviewed-by: Geert Uytterhoeven <geert(a)linux-m68k.org>
Reviewed-by: Christoph Hellwig <hch(a)infradead.org>
---
Changes from RFC:
- use u64 instead of sector_t, since that may be u32 without LBD support
- check multiplication overflows each step - 3 u32 values may exceed u64!
- warn against use on AmigaDOS if partition data overflow u32 sector count.
- warn if partition CylBlocks larger than what's stored in the RDSK header.
- bail out if we were to overflow sector_t (32 or 64 bit).
Changes from v1:
Kars de Jong:
- use defines for magic offsets in DosEnvec struct
Geert Uytterhoeven:
- use u64 cast for multiplications of u32 numbers
- use array3_size for overflow checks
- change pr_err to pr_warn
- discontinue use of pr_cont
- reword log messages
- drop redundant nr_sects overflow test
- warn against 32 bit overflow for each affected partition
- skip partitions that overflow sector_t size instead of aborting scan
Changes from v2:
- further trim 32 bit overflow test
- correct duplicate types.h inclusion introduced in v2
Changes from v3:
- split off sector address type fix for independent review
- change blksize to unsigned
- use check_mul_overflow() instead of array3_size()
- rewrite checks to avoid 64 bit divisions in check_mul_overflow()
Changes from v5:
Geert Uytterhoeven:
- correct ineffective u64 cast to avoid 32 bit mult. overflow
- fix mult. overflow in partition block address calculation
Changes from v6:
Geert Uytterhoeven:
- don't fail hard on partition block address overflow
Changes from v7:
- replace bdevname(state->bdev, b) by state->disk->disk_name
- drop warn_no_part conditionals
- remove remaining warn_no_part
Changes from v8:
Christoph Hellwig:
- whitespace fix, drop unnecessary u64 casts
kbuild warning:
- sparse warning fix
Changes from v9:
- revert ineffective sparse warning fix, and rely on
change to annotation of rdb_CylBlocks (patch 2 this
series) instead.
- add Fixes: tags and stable backport prereq
---
block/partitions/amiga.c | 103 ++++++++++++++++++++++++++++++++-------
1 file changed, 85 insertions(+), 18 deletions(-)
diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c
index 85c5c79aae48..ed222b9c901b 100644
--- a/block/partitions/amiga.c
+++ b/block/partitions/amiga.c
@@ -11,10 +11,18 @@
#define pr_fmt(fmt) fmt
#include <linux/types.h>
+#include <linux/mm_types.h>
+#include <linux/overflow.h>
#include <linux/affs_hardblocks.h>
#include "check.h"
+/* magic offsets in partition DosEnvVec */
+#define NR_HD 3
+#define NR_SECT 5
+#define LO_CYL 9
+#define HI_CYL 10
+
static __inline__ u32
checksum_block(__be32 *m, int size)
{
@@ -31,9 +39,12 @@ int amiga_partition(struct parsed_partitions *state)
unsigned char *data;
struct RigidDiskBlock *rdb;
struct PartitionBlock *pb;
- sector_t start_sect, nr_sects;
- int blk, part, res = 0;
- int blksize = 1; /* Multiplier for disk block size */
+ u64 start_sect, nr_sects;
+ sector_t blk, end_sect;
+ u32 cylblk; /* rdb_CylBlocks = nr_heads*sect_per_track */
+ u32 nr_hd, nr_sect, lo_cyl, hi_cyl;
+ int part, res = 0;
+ unsigned int blksize = 1; /* Multiplier for disk block size */
int slot = 1;
for (blk = 0; ; blk++, put_dev_sector(sect)) {
@@ -41,7 +52,7 @@ int amiga_partition(struct parsed_partitions *state)
goto rdb_done;
data = read_part_sector(state, blk, &sect);
if (!data) {
- pr_err("Dev %s: unable to read RDB block %d\n",
+ pr_err("Dev %s: unable to read RDB block %llu\n",
state->disk->disk_name, blk);
res = -1;
goto rdb_done;
@@ -58,12 +69,12 @@ int amiga_partition(struct parsed_partitions *state)
*(__be32 *)(data+0xdc) = 0;
if (checksum_block((__be32 *)data,
be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
- pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n",
+ pr_err("Trashed word at 0xd0 in block %llu ignored in checksum calculation\n",
blk);
break;
}
- pr_err("Dev %s: RDB in block %d has bad checksum\n",
+ pr_err("Dev %s: RDB in block %llu has bad checksum\n",
state->disk->disk_name, blk);
}
@@ -80,10 +91,15 @@ int amiga_partition(struct parsed_partitions *state)
blk = be32_to_cpu(rdb->rdb_PartitionList);
put_dev_sector(sect);
for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
- blk *= blksize; /* Read in terms partition table understands */
+ /* Read in terms partition table understands */
+ if (check_mul_overflow(blk, (sector_t) blksize, &blk)) {
+ pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n",
+ state->disk->disk_name, blk, part);
+ break;
+ }
data = read_part_sector(state, blk, &sect);
if (!data) {
- pr_err("Dev %s: unable to read partition block %d\n",
+ pr_err("Dev %s: unable to read partition block %llu\n",
state->disk->disk_name, blk);
res = -1;
goto rdb_done;
@@ -95,19 +111,70 @@ int amiga_partition(struct parsed_partitions *state)
if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 )
continue;
- /* Tell Kernel about it */
+ /* RDB gives us more than enough rope to hang ourselves with,
+ * many times over (2^128 bytes if all fields max out).
+ * Some careful checks are in order, so check for potential
+ * overflows.
+ * We are multiplying four 32 bit numbers to one sector_t!
+ */
+
+ nr_hd = be32_to_cpu(pb->pb_Environment[NR_HD]);
+ nr_sect = be32_to_cpu(pb->pb_Environment[NR_SECT]);
+
+ /* CylBlocks is total number of blocks per cylinder */
+ if (check_mul_overflow(nr_hd, nr_sect, &cylblk)) {
+ pr_err("Dev %s: heads*sects %u overflows u32, skipping partition!\n",
+ state->disk->disk_name, cylblk);
+ continue;
+ }
+
+ /* check for consistency with RDB defined CylBlocks */
+ if (cylblk > be32_to_cpu(rdb->rdb_CylBlocks)) {
+ pr_warn("Dev %s: cylblk %u > rdb_CylBlocks %u!\n",
+ state->disk->disk_name, cylblk,
+ be32_to_cpu(rdb->rdb_CylBlocks));
+ }
+
+ /* RDB allows for variable logical block size -
+ * normalize to 512 byte blocks and check result.
+ */
+
+ if (check_mul_overflow(cylblk, blksize, &cylblk)) {
+ pr_err("Dev %s: partition %u bytes per cyl. overflows u32, skipping partition!\n",
+ state->disk->disk_name, part);
+ continue;
+ }
+
+ /* Calculate partition start and end. Limit of 32 bit on cylblk
+ * guarantees no overflow occurs if LBD support is enabled.
+ */
+
+ lo_cyl = be32_to_cpu(pb->pb_Environment[LO_CYL]);
+ start_sect = ((u64) lo_cyl * cylblk);
+
+ hi_cyl = be32_to_cpu(pb->pb_Environment[HI_CYL]);
+ nr_sects = (((u64) hi_cyl - lo_cyl + 1) * cylblk);
- nr_sects = ((sector_t)be32_to_cpu(pb->pb_Environment[10]) + 1 -
- be32_to_cpu(pb->pb_Environment[9])) *
- be32_to_cpu(pb->pb_Environment[3]) *
- be32_to_cpu(pb->pb_Environment[5]) *
- blksize;
if (!nr_sects)
continue;
- start_sect = (sector_t)be32_to_cpu(pb->pb_Environment[9]) *
- be32_to_cpu(pb->pb_Environment[3]) *
- be32_to_cpu(pb->pb_Environment[5]) *
- blksize;
+
+ /* Warn user if partition end overflows u32 (AmigaDOS limit) */
+
+ if ((start_sect + nr_sects) > UINT_MAX) {
+ pr_warn("Dev %s: partition %u (%llu-%llu) needs 64 bit device support!\n",
+ state->disk->disk_name, part,
+ start_sect, start_sect + nr_sects);
+ }
+
+ if (check_add_overflow(start_sect, nr_sects, &end_sect)) {
+ pr_err("Dev %s: partition %u (%llu-%llu) needs LBD device support, skipping partition!\n",
+ state->disk->disk_name, part,
+ start_sect, end_sect);
+ continue;
+ }
+
+ /* Tell Kernel about it */
+
put_partition(state,slot++,start_sect,nr_sects);
{
/* Be even more informative to aid mounting */
--
2.17.1
The Amiga partition parser module uses signed int for partition sector
address and count, which will overflow for disks larger than 1 TB.
Use sector_t as type for sector address and size to allow using disks
up to 2 TB without LBD support, and disks larger than 2 TB with LBD.
This bug was reported originally in 2012, and the fix was created by
the RDB author, Joanne Dow <jdow(a)earthlink.net>. A patch had been
discussed and reviewed on linux-m68k at that time but never officially
submitted. This patch differs from Joanne's patch only in its use of
sector_t instead of unsigned int. No checking for overflows is done
(see patch 3 of this series for that).
Reported-by: Martin Steigerwald <Martin(a)lichtvoll.de>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=43511
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Message-ID: <201206192146.09327.Martin(a)lichtvoll.de>
Cc: <stable(a)vger.kernel.org> # 5.2
Signed-off-by: Michael Schmitz <schmitzmic(a)gmail.com>
Tested-by: Martin Steigerwald <Martin(a)lichtvoll.de>
Reviewed-by: Geert Uytterhoeven <geert(a)linux-m68k.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
---
Changes from v3:
- split off change of sector address type as quick fix.
- cast to sector_t in sector address calculations.
- move overflow checking to separate patch for more thorough review.
Changes from v4:
Andreas Schwab:
- correct cast to sector_t in sector address calculations
Changes from v7:
Christoph Hellwig
- correct style issues
Changes from v9:
- add Fixes: tags and stable backport prereq
---
block/partitions/amiga.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c
index 5c8624e26a54..85c5c79aae48 100644
--- a/block/partitions/amiga.c
+++ b/block/partitions/amiga.c
@@ -31,7 +31,8 @@ int amiga_partition(struct parsed_partitions *state)
unsigned char *data;
struct RigidDiskBlock *rdb;
struct PartitionBlock *pb;
- int start_sect, nr_sects, blk, part, res = 0;
+ sector_t start_sect, nr_sects;
+ int blk, part, res = 0;
int blksize = 1; /* Multiplier for disk block size */
int slot = 1;
@@ -96,14 +97,14 @@ int amiga_partition(struct parsed_partitions *state)
/* Tell Kernel about it */
- nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 -
- be32_to_cpu(pb->pb_Environment[9])) *
+ nr_sects = ((sector_t)be32_to_cpu(pb->pb_Environment[10]) + 1 -
+ be32_to_cpu(pb->pb_Environment[9])) *
be32_to_cpu(pb->pb_Environment[3]) *
be32_to_cpu(pb->pb_Environment[5]) *
blksize;
if (!nr_sects)
continue;
- start_sect = be32_to_cpu(pb->pb_Environment[9]) *
+ start_sect = (sector_t)be32_to_cpu(pb->pb_Environment[9]) *
be32_to_cpu(pb->pb_Environment[3]) *
be32_to_cpu(pb->pb_Environment[5]) *
blksize;
--
2.17.1
A crash was reported in amd-sfh, related to HID core initialization
running before SFH initialization has completed:
```
amdtp_hid_request+0x36/0x50 [amd_sfh
2e3095779aada9fdb1764f08ca578ccb14e41fe4]
sensor_hub_get_feature+0xad/0x170 [hid_sensor_hub
d6157999c9d260a1bfa6f27d4a0dc2c3e2c5654e]
hid_sensor_parse_common_attributes+0x217/0x310 [hid_sensor_iio_common
07a7935272aa9c7a28193b574580b3e953a64ec4]
hid_gyro_3d_probe+0x7f/0x2e0 [hid_sensor_gyro_3d
9f2eb51294a1f0c0315b365f335617cbaef01eab]
platform_probe+0x44/0xa0
really_probe+0x19e/0x3e0
```
Ensure that sensors have been set up before calling into
amd_sfh_get_report() or amd_sfh_set_report().
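The guard amounts to a simple early-return pattern; a standalone
sketch with illustrative names:
```
#include <errno.h>
#include <stdio.h>

/* Bail out of report handlers until the sensor setup path has run. */
struct cli_data { int is_any_sensor_enabled; };

static int get_report(struct cli_data *cli)
{
	if (!cli->is_any_sensor_enabled)
		return -ENODEV;  /* HID core called us too early */
	return 0;
}

int main(void)
{
	struct cli_data cli = { 0 };

	printf("early get_report(): %d\n", get_report(&cli));
	cli.is_any_sensor_enabled = 1;
	printf("after setup: %d\n", get_report(&cli));
	return 0;
}
```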
Cc: stable(a)vger.kernel.org
Cc: Linux regression tracking (Thorsten Leemhuis) <regressions(a)leemhuis.info>
Fixes: 7bcfdab3f0c6 ("HID: amd_sfh: if no sensors are enabled, clean up")
Reported-by: Haochen Tong <linux(a)hexchain.org>
Link: https://lore.kernel.org/all/3250319.ancTxkQ2z5@zen/T/
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
---
drivers/hid/amd-sfh-hid/amd_sfh_client.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_client.c b/drivers/hid/amd-sfh-hid/amd_sfh_client.c
index d9b7b01900b5..88f3d913eaa1 100644
--- a/drivers/hid/amd-sfh-hid/amd_sfh_client.c
+++ b/drivers/hid/amd-sfh-hid/amd_sfh_client.c
@@ -25,6 +25,9 @@ void amd_sfh_set_report(struct hid_device *hid, int report_id,
struct amdtp_cl_data *cli_data = hid_data->cli_data;
int i;
+ if (!cli_data->is_any_sensor_enabled)
+ return;
+
for (i = 0; i < cli_data->num_hid_devices; i++) {
if (cli_data->hid_sensor_hubs[i] == hid) {
cli_data->cur_hid_dev = i;
@@ -41,6 +44,9 @@ int amd_sfh_get_report(struct hid_device *hid, int report_id, int report_type)
struct request_list *req_list = &cli_data->req_list;
int i;
+ if (!cli_data->is_any_sensor_enabled)
+ return -ENODEV;
+
for (i = 0; i < cli_data->num_hid_devices; i++) {
if (cli_data->hid_sensor_hubs[i] == hid) {
struct request_list *new = kzalloc(sizeof(*new), GFP_KERNEL);
--
2.34.1
I was able to reproduce the crash on a 5.15.y kernel during COW, when
the grandchild process attempts a write to a private page inherited
from the child process and that page contains an uncorrectable memory
error. The way to reproduce it is described in Tony's patch, using his
ras-tools/einj_mem_uc. This patch series fixes the panic issue in
5.15.y.
The backport has encountered trivial conflicts due to missing
dependencies, details are provided in each patch.
Please let me know whether the backport is acceptable.
Tony Luck (2):
mm, hwpoison: try to recover from copy-on write faults
mm, hwpoison: when copy-on-write hits poison, take page offline
include/linux/highmem.h | 24 ++++++++++++++++++++++++
include/linux/mm.h | 5 ++++-
mm/memory.c | 33 +++++++++++++++++++++++----------
3 files changed, 51 insertions(+), 11 deletions(-)
--
2.18.4
From: "Uladzislau Rezki (Sony)" <urezki(a)gmail.com>
commit 5da7cb193db32da783a3f3e77d8b639989321d48 upstream.
Memory passed to kvfree_rcu() that is to be freed is tracked by a
per-CPU kfree_rcu_cpu structure, which in turn contains pointers
to kvfree_rcu_bulk_data structures that contain pointers to memory
that has not yet been handed to RCU, along with a kfree_rcu_cpu_work
structure that tracks the memory that has already been handed to RCU.
These structures track three categories of memory: (1) Memory for
kfree(), (2) Memory for kvfree(), and (3) Memory for both that arrived
during an OOM episode. The first two categories are tracked in a
cache-friendly manner involving a dynamically allocated page of pointers
(the aforementioned kvfree_rcu_bulk_data structures), while the third
uses a simple (but decidedly cache-unfriendly) linked list through the
rcu_head structures in each block of memory.
On a given CPU, these three categories are handled as a unit, with that
CPU's kfree_rcu_cpu_work structure having one pointer for each of the
three categories. Clearly, new memory for a given category cannot be
placed in the corresponding kfree_rcu_cpu_work structure until any old
memory has had its grace period elapse and thus has been removed. And
the kfree_rcu_monitor() function does in fact check for this.
Except that the kfree_rcu_monitor() function checks these pointers one
at a time. This means that if the previous kfree_rcu() memory passed
to RCU had only category 1 and the current one has only category 2, the
kfree_rcu_monitor() function will send that current category-2 memory
along immediately. This can result in memory being freed too soon,
that is, out from under unsuspecting RCU readers.
To see this, consider the following sequence of events, in which:
o Task A on CPU 0 calls rcu_read_lock(), then uses "from_cset",
then is preempted.
o CPU 1 calls kfree_rcu(cset, rcu_head) in order to free "from_cset"
after a later grace period. Except that "from_cset" is freed
right after the previous grace period ended, so that "from_cset"
is immediately freed. Task A resumes and references "from_cset"'s
member, after which nothing good happens.
In full detail:
CPU 0 CPU 1
---------------------- ----------------------
count_memcg_event_mm()
|rcu_read_lock() <---
|mem_cgroup_from_task()
|// css_set_ptr is the "from_cset" mentioned on CPU 1
|css_set_ptr = rcu_dereference((task)->cgroups)
|// Hard irq comes, current task is scheduled out.
cgroup_attach_task()
|cgroup_migrate()
|cgroup_migrate_execute()
|css_set_move_task(task, from_cset, to_cset, true)
|cgroup_move_task(task, to_cset)
|rcu_assign_pointer(.., to_cset)
|...
|cgroup_migrate_finish()
|put_css_set_locked(from_cset)
|from_cset->refcount return 0
|kfree_rcu(cset, rcu_head) // free from_cset after new gp
|add_ptr_to_bulk_krc_lock()
|schedule_delayed_work(&krcp->monitor_work, ..)
kfree_rcu_monitor()
|krcp->bulk_head[0]'s work attached to krwp->bulk_head_free[]
|queue_rcu_work(system_wq, &krwp->rcu_work)
|if rwork->rcu.work is not in WORK_STRUCT_PENDING_BIT state,
|call_rcu(&rwork->rcu, rcu_work_rcufn) <--- request new gp
// There is a previous call_rcu(.., rcu_work_rcufn)
// gp end, rcu_work_rcufn() is called.
rcu_work_rcufn()
|__queue_work(.., rwork->wq, &rwork->work);
|kfree_rcu_work()
|krwp->bulk_head_free[0] bulk is freed before new gp end!!!
|The "from_cset" is freed before new gp end.
// the task resumes some time later.
|css_set_ptr->subsys[(subsys_id) <--- Caused kernel crash, because css_set_ptr is freed.
This commit therefore causes kfree_rcu_monitor() to refrain from moving
kfree_rcu() memory to the kfree_rcu_cpu_work structure until the RCU
grace period has completed for all three categories.
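The invariant the new helpers enforce (a krwp slot may be reused only
when every channel has drained) can be sketched standalone; the types
here are heavily simplified and only the shape of the check is kept:
```
#include <stdbool.h>
#include <stdio.h>

#define FREE_N_CHANNELS 2

/* A krwp slot is busy if any bulk channel or the plain head list
 * still holds memory waiting on a grace period; checking them one at
 * a time, as the old code did, could recycle the slot too early. */
struct krwp { void *bkvhead_free[FREE_N_CHANNELS]; void *head_free; };

static bool need_wait_for_krwp_work(struct krwp *krwp)
{
	for (int i = 0; i < FREE_N_CHANNELS; i++)
		if (krwp->bkvhead_free[i])
			return true;
	return krwp->head_free != NULL;
}

int main(void)
{
	struct krwp krwp = { { (void *)1, NULL }, NULL };

	/* channel 0 busy: must not attach fresh channel-1 memory yet */
	printf("wait: %d\n", need_wait_for_krwp_work(&krwp));
	return 0;
}
```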
v2: Use helper function instead of inserted code block at kfree_rcu_monitor().
[UR: backport to 5.10-stable]
[UR: Added missing need_offload_krc() function]
Fixes: 34c881745549 ("rcu: Support kfree_bulk() interface in kfree_rcu()")
Fixes: 5f3c8d620447 ("rcu/tree: Maintain separate array for vmalloc ptrs")
Reported-by: Mukesh Ojha <quic_mojha(a)quicinc.com>
Signed-off-by: Ziwei Dai <ziwei.dai(a)unisoc.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki(a)gmail.com>
Tested-by: Uladzislau Rezki (Sony) <urezki(a)gmail.com>
Signed-off-by: Paul E. McKenney <paulmck(a)kernel.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki(a)gmail.com>
Signed-off-by: Suren Baghdasaryan <surenb(a)google.com>
---
Resending per Greg's request.
Original posting: https://lore.kernel.org/all/20230418102518.5911-1-urezki@gmail.com/
kernel/rcu/tree.c | 49 +++++++++++++++++++++++++++++++++--------------
1 file changed, 35 insertions(+), 14 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 30e1d7fedb5f..eec8e2f7537e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3281,6 +3281,30 @@ static void kfree_rcu_work(struct work_struct *work)
}
}
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (krcp->bkvhead[i])
+ return true;
+
+ return !!krcp->head;
+}
+
+static bool
+need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (krwp->bkvhead_free[i])
+ return true;
+
+ return !!krwp->head_free;
+}
+
/*
* Schedule the kfree batch RCU work to run in workqueue context after a GP.
*
@@ -3298,16 +3322,13 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
for (i = 0; i < KFREE_N_BATCHES; i++) {
krwp = &(krcp->krw_arr[i]);
- /*
- * Try to detach bkvhead or head and attach it over any
- * available corresponding free channel. It can be that
- * a previous RCU batch is in progress, it means that
- * immediately to queue another one is not possible so
- * return false to tell caller to retry.
- */
- if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
- (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
- (krcp->head && !krwp->head_free)) {
+ // Try to detach bulk_head or head and attach it, only when
+ // all channels are free. Any channel is not free means at krwp
+ // there is on-going rcu work to handle krwp's free business.
+ if (need_wait_for_krwp_work(krwp))
+ continue;
+
+ if (need_offload_krc(krcp)) {
// Channel 1 corresponds to SLAB ptrs.
// Channel 2 corresponds to vmalloc ptrs.
for (j = 0; j < FREE_N_CHANNELS; j++) {
@@ -3334,12 +3355,12 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
*/
queue_rcu_work(system_wq, &krwp->rcu_work);
}
-
- // Repeat if any "free" corresponding channel is still busy.
- if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
- repeat = true;
}
+ // Repeat if any "free" corresponding channel is still busy.
+ if (need_offload_krc(krcp))
+ repeat = true;
+
return !repeat;
}
--
2.41.0.162.gfafddb0af9-goog
This patch fixes a stable-only patch, so it has no direct upstream
equivalent.
After the stable-only patch that explicitly handles the '.got' section
to address an orphan section warning from the linker, certain
configurations fail to link with ld.lld, which enables relro by default:
ld.lld: error: section: .got is not contiguous with other relro sections
This has come up with other architectures before, such as arm and arm64
in commit 0cda9bc15dfc ("ARM: 9038/1: Link with '-z norelro'") and
commit 3b92fa7485eb ("arm64: link with -z norelro regardless of
CONFIG_RELOCATABLE"). Additionally, '-z norelro' is used unconditionally
for RISC-V upstream after commit 26e7aacb83df ("riscv: Allow to
downgrade paging mode from the command line"), which alluded to this
issue for the same reason. Bring 6.3 in line with mainline and link with
'-z norelro', which resolves the above link failure.
Fixes: e6d1562dd4e9 ("riscv: vmlinux.lds.S: Explicitly handle '.got' section")
Reported-by: kernel test robot <lkp(a)intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306192231.DJmWr6BX-lkp@intel.com/
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
arch/riscv/Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index b05e833a022d..d46b6722710f 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -7,7 +7,7 @@
#
OBJCOPYFLAGS := -O binary
-LDFLAGS_vmlinux :=
+LDFLAGS_vmlinux := -z norelro
ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
LDFLAGS_vmlinux := --no-relax
KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
---
base-commit: f2427f9a3730e9a1a11b69f6b767f7f2fad87523
change-id: 20230620-6-3-fix-got-relro-error-lld-397f3112860b
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
In jfs_dmap.c at line 381, BLKTODMAP is used to get a logical block
number inside dbFree(). db_l2nbperpage, which is the log2 number of
blocks per page, is passed as an argument to BLKTODMAP which uses it
for shifting.
Syzbot reported a shift out-of-bounds crash because db_l2nbperpage is
too big. This happens because the large value is set without any
validation in dbMount() at line 181.
Thus, make sure that db_l2nbperpage is correct while mounting.
Max number of blocks per page = Page size / Min block size
=> log2(Max num_block per page) = log2(Page size / Min block size)
= log2(Page size) - log2(Min block size)
=> Max db_l2nbperpage = L2PSIZE - L2MINBLOCKSIZE
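Plugging in JFS's constants gives a concrete bound; a small sketch,
assuming L2PSIZE is 12 (a 4096-byte metapage), with L2MINBLOCKSIZE 9
as added by this patch:
```
#include <stdio.h>

#define L2PSIZE 12          /* log2(4096-byte metapage), assumed */
#define L2MINBLOCKSIZE 9    /* log2(512-byte minimum block) */

int main(void)
{
	int max_l2nbperpage = L2PSIZE - L2MINBLOCKSIZE;

	/* 12 - 9 = 3 -> at most 2^3 = 8 blocks per page; any larger
	 * value read from the superblock is rejected with -EINVAL. */
	printf("max db_l2nbperpage = %d (%d blocks/page)\n",
	       max_l2nbperpage, 1 << max_l2nbperpage);
	return 0;
}
```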
Reported-and-tested-by: syzbot+d2cd27dcf8e04b232eb2(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?id=2a70a453331db32ed491f5cbb07e81bf2d2257…
Cc: stable(a)vger.kernel.org
Suggested-by: Dave Kleikamp <dave.kleikamp(a)oracle.com>
Signed-off-by: Siddh Raman Pant <code(a)siddh.me>
---
Changes in v3:
- Fix typo in commit message (number of pages -> number of blocks per page).
Changes in v2:
- Fix upper bound as pointed out in v1 by Shaggy.
- Add an explanation for the same in commit message for completeness.
fs/jfs/jfs_dmap.c | 6 ++++++
fs/jfs/jfs_filsys.h | 2 ++
2 files changed, 8 insertions(+)
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index a3eb1e826947..da6a2bc6bf02 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -178,7 +178,13 @@ int dbMount(struct inode *ipbmap)
dbmp_le = (struct dbmap_disk *) mp->data;
bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
+
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
+ if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
if (!bmp->db_numag) {
err = -EINVAL;
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index b5d702df7111..33ef13a0b110 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -122,7 +122,9 @@
#define NUM_INODE_PER_IAG INOSPERIAG
#define MINBLOCKSIZE 512
+#define L2MINBLOCKSIZE 9
#define MAXBLOCKSIZE 4096
+#define L2MAXBLOCKSIZE 12
#define MAXFILESIZE ((s64)1 << 52)
#define JFS_LINK_MAX 0xffffffff
--
2.39.2
In jfs_dmap.c at line 381, BLKTODMAP is used to get a logical block
number inside dbFree(). db_l2nbperpage, which is the log2 number of
blocks per page, is passed as an argument to BLKTODMAP which uses it
for shifting.
Syzbot reported a shift out-of-bounds crash because db_l2nbperpage is
too big. This happens because the large value is set without any
validation in dbMount() at line 181.
Thus, make sure that db_l2nbperpage is correct while mounting.
Max number of pages = Page size / Min block size
=> log2(Max number of pages) = log2(Page size / Min block size)
= log2(Page size) - log2(Min block size)
=> Max db_l2nbperpage = L2PSIZE - L2MINBLOCKSIZE
Reported-and-tested-by: syzbot+d2cd27dcf8e04b232eb2(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?id=2a70a453331db32ed491f5cbb07e81bf2d2257…
Cc: stable(a)vger.kernel.org
Suggested-by: Dave Kleikamp <dave.kleikamp(a)oracle.com>
Signed-off-by: Siddh Raman Pant <code(a)siddh.me>
---
Changes in v2:
- Fix upper bound as pointed out in v1 by Shaggy.
- Add an explanation for the same in commit message for completeness.
fs/jfs/jfs_dmap.c | 6 ++++++
fs/jfs/jfs_filsys.h | 2 ++
2 files changed, 8 insertions(+)
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index a3eb1e826947..da6a2bc6bf02 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -178,7 +178,13 @@ int dbMount(struct inode *ipbmap)
dbmp_le = (struct dbmap_disk *) mp->data;
bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
+
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
+ if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
if (!bmp->db_numag) {
err = -EINVAL;
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index b5d702df7111..33ef13a0b110 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -122,7 +122,9 @@
#define NUM_INODE_PER_IAG INOSPERIAG
#define MINBLOCKSIZE 512
+#define L2MINBLOCKSIZE 9
#define MAXBLOCKSIZE 4096
+#define L2MAXBLOCKSIZE 12
#define MAXFILESIZE ((s64)1 << 52)
#define JFS_LINK_MAX 0xffffffff
--
2.39.2
From: Jim Wylder <jwylder(a)google.com>
[ Upstream commit 3981514180c987a79ea98f0ae06a7cbf58a9ac0f ]
Currently, when regmap_raw_write() splits the data, it uses the
max_raw_write value defined for the bus. For any bus that includes
the target register address in the max_raw_write value, the chunked
transmission will always exceed the maximum transmission length.
To avoid this problem, subtract the length of the register and the
padding from the maximum transmission.
Signed-off-by: Jim Wylder <jwylder(a)google.com>
Link: https://lore.kernel.org/r/20230517152444.3690870-2-jwylder@google.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/base/regmap/regmap.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 7de1f27d0323d..8359164bff903 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -2064,6 +2064,8 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg,
size_t val_count = val_len / val_bytes;
size_t chunk_count, chunk_bytes;
size_t chunk_regs = val_count;
+ size_t max_data = map->max_raw_write - map->format.reg_bytes -
+ map->format.pad_bytes;
int ret, i;
if (!val_count)
@@ -2071,8 +2073,8 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg,
if (map->use_single_write)
chunk_regs = 1;
- else if (map->max_raw_write && val_len > map->max_raw_write)
- chunk_regs = map->max_raw_write / val_bytes;
+ else if (map->max_raw_write && val_len > max_data)
+ chunk_regs = max_data / val_bytes;
chunk_count = val_count / chunk_regs;
chunk_bytes = chunk_regs * val_bytes;
--
2.39.2
Lowcomms is currently configured via configfs entries. Each comms
configfs entry will create a lowcomms connection. Even the local
connection itself will be stored as a lowcomms connection, although
most functionality of a lowcomms connection struct is unnecessary for
the local node. Now, in some scenarios, dlm_controld reports -EEXIST
when configuring a node via configfs:
... /sys/kernel/config/dlm/cluster/comms/1/addr: write failed: 17 -1
Doing a:
cat /sys/kernel/config/dlm/cluster/comms/1/addr_list
reported nothing. This was seen on a cluster with nodeid 1 and its
local configuration. To keep the configfs entries in sync with the
lowcomms connection structures, always call dlm_midcomms_close() to
make sure the lowcomms connection gets removed when the configfs entry
is dropped.
Before commit 07ee38674a0b ("fs: dlm: filter ourself midcomms calls")
this happened by accident, and the filter
if (nodeid == dlm_our_nodeid())
return 0;
inside dlm_midcomms_close() was never hit, because drop_comm() sets
local_comm to NULL, which causes dlm_our_nodeid() to always return the
invalid nodeid 0.
Fixes: 07ee38674a0b ("fs: dlm: filter ourself midcomms calls")
Cc: stable(a)vger.kernel.org
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
---
changes since v2:
- add fixes tag
- cc stable
fs/dlm/config.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 4246cd425671..2beceff024e3 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -532,8 +532,7 @@ static void drop_comm(struct config_group *g, struct config_item *i)
struct dlm_comm *cm = config_item_to_comm(i);
if (local_comm == cm)
local_comm = NULL;
- if (!cm->local)
- dlm_midcomms_close(cm->nodeid);
+ dlm_midcomms_close(cm->nodeid);
while (cm->addr_count--)
kfree(cm->addr[cm->addr_count]);
config_item_put(i);
--
2.31.1
Don't assume that the device is fully under the control of PCI core.
Use RMW capability accessors in link retraining which do proper locking
to avoid losing concurrent updates to the register values.
Fixes: 4ec73791a64b ("PCI: Work around Pericom PCIe-to-PCI bridge Retrain Link erratum")
Fixes: 7d715a6c1ae5 ("PCI: add PCI Express ASPM support")
Suggested-by: Lukas Wunner <lukas(a)wunner.de>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
pci/enumeration branch moves the link retraining code into PCI core and
also conflicts with a link retraining fix in pci/aspm. The changelog
(and patch splitting) takes the move into account by not referring to
ASPM while the change itself is not based on pci/enumeration (as per
Bjorn's preference).
---
drivers/pci/pcie/aspm.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 66d7514ca111..50e32bda4656 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -199,17 +199,14 @@ static bool pcie_retrain_link(struct pcie_link_state *link)
unsigned long end_jiffies;
u16 reg16;
- pcie_capability_read_word(parent, PCI_EXP_LNKCTL, ®16);
- reg16 |= PCI_EXP_LNKCTL_RL;
- pcie_capability_write_word(parent, PCI_EXP_LNKCTL, reg16);
+ pcie_capability_set_word(parent, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_RL);
if (parent->clear_retrain_link) {
/*
* Due to an erratum in some devices the Retrain Link bit
* needs to be cleared again manually to allow the link
* training to succeed.
*/
- reg16 &= ~PCI_EXP_LNKCTL_RL;
- pcie_capability_write_word(parent, PCI_EXP_LNKCTL, reg16);
+ pcie_capability_clear_word(parent, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_RL);
}
/* Wait for link training end. Break out after waiting for timeout */
--
2.30.2
Many places in the kernel write the Link Control and Root Control PCI
Express Capability Registers without proper concurrency control and
this could result in losing the changes one of the writers intended to
make.
Add pcie_cap_lock spinlock into the struct pci_dev and use it to
protect bit changes made in the RMW capability accessors. Protect only
a selected set of registers by differentiating the RMW accessor
internally to locked/unlocked variants using a wrapper which has the
same signature as pcie_capability_clear_and_set_word(). As the
Capability Register (pos) given to the wrapper is always a constant,
the compiler should be able to simplify all the dead-code away.
The RMW locking is only added to pcie_capability_clear_and_set_word()
because so far only the Link Control Register (ASPM, hotplug, link
retraining, various drivers) and the Root Control Register (AER & PME)
require RMW locking.
Fixes: c7f486567c1d ("PCI PM: PCIe PME root port service driver")
Fixes: f12eb72a268b ("PCI/ASPM: Use PCI Express Capability accessors")
Fixes: 7d715a6c1ae5 ("PCI: add PCI Express ASPM support")
Fixes: affa48de8417 ("staging/rdma/hfi1: Add support for enabling/disabling PCIe ASPM")
Fixes: 849a9366cba9 ("misc: rtsx: Add support new chip rts5228 mmc: rtsx: Add support MMC_CAP2_NO_MMC")
Fixes: 3d1e7aa80d1c ("misc: rtsx: Use pcie_capability_clear_and_set_word() for PCI_EXP_LNKCTL")
Fixes: c0e5f4e73a71 ("misc: rtsx: Add support for RTS5261")
Fixes: 3df4fce739e2 ("misc: rtsx: separate aspm mode into MODE_REG and MODE_CFG")
Fixes: 121e9c6b5c4c ("misc: rtsx: modify and fix init_hw function")
Fixes: 19f3bd548f27 ("mfd: rtsx: Remove LCTLR defination")
Fixes: 773ccdfd9cc6 ("mfd: rtsx: Read vendor setting from config space")
Fixes: 8275b77a1513 ("mfd: rts5249: Add support for RTS5250S power saving")
Fixes: 5da4e04ae480 ("misc: rtsx: Add support for RTS5260")
Fixes: 0f49bfbd0f2e ("tg3: Use PCI Express Capability accessors")
Fixes: 5e7dfd0fb94a ("tg3: Prevent corruption at 10 / 100Mbps w CLKREQ")
Fixes: b726e493e8dc ("r8169: sync existing 8168 device hardware start sequences with vendor driver")
Fixes: e6de30d63eb1 ("r8169: more 8168dp support.")
Fixes: 8a06127602de ("Bluetooth: hci_bcm4377: Add new driver for BCM4377 PCIe boards")
Fixes: 6f461f6c7c96 ("e1000e: enable/disable ASPM L0s and L1 and ERT according to hardware errata")
Fixes: 1eae4eb2a1c7 ("e1000e: Disable L1 ASPM power savings for 82573 mobile variants")
Fixes: 8060e169e02f ("ath9k: Enable extended synch for AR9485 to fix L0s recovery issue")
Fixes: 69ce674bfa69 ("ath9k: do btcoex ASPM disabling at initialization time")
Fixes: f37f05503575 ("mt76: mt76x2e: disable pcie_aspm by default")
Suggested-by: Lukas Wunner <lukas(a)wunner.de>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Reviewed-by: Rafael J. Wysocki <rafael(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
drivers/pci/access.c | 20 +++++++++++++++++---
drivers/pci/probe.c | 1 +
include/linux/pci.h | 34 ++++++++++++++++++++++++++++++++--
3 files changed, 50 insertions(+), 5 deletions(-)
diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 3c230ca3de58..0b2e90d2f04f 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -497,8 +497,8 @@ int pcie_capability_write_dword(struct pci_dev *dev, int pos, u32 val)
}
EXPORT_SYMBOL(pcie_capability_write_dword);
-int pcie_capability_clear_and_set_word(struct pci_dev *dev, int pos,
- u16 clear, u16 set)
+int pcie_capability_clear_and_set_word_unlocked(struct pci_dev *dev, int pos,
+ u16 clear, u16 set)
{
int ret;
u16 val;
@@ -512,7 +512,21 @@ int pcie_capability_clear_and_set_word(struct pci_dev *dev, int pos,
return ret;
}
-EXPORT_SYMBOL(pcie_capability_clear_and_set_word);
+EXPORT_SYMBOL(pcie_capability_clear_and_set_word_unlocked);
+
+int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos,
+ u16 clear, u16 set)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&dev->pcie_cap_lock, flags);
+ ret = pcie_capability_clear_and_set_word_unlocked(dev, pos, clear, set);
+ spin_unlock_irqrestore(&dev->pcie_cap_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(pcie_capability_clear_and_set_word_locked);
int pcie_capability_clear_and_set_dword(struct pci_dev *dev, int pos,
u32 clear, u32 set)
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0b2826c4a832..53ac0d3287a8 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2318,6 +2318,7 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus)
.end = -1,
};
+ spin_lock_init(&dev->pcie_cap_lock);
#ifdef CONFIG_PCI_MSI
raw_spin_lock_init(&dev->msi_lock);
#endif
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60b8772b5bd4..ab7682ed172f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -467,6 +467,7 @@ struct pci_dev {
pci_dev_flags_t dev_flags;
atomic_t enable_cnt; /* pci_enable_device has been called */
+ spinlock_t pcie_cap_lock; /* Protects RMW ops in capability accessors */
u32 saved_config_space[16]; /* Config space saved at suspend time */
struct hlist_head saved_cap_space;
int rom_attr_enabled; /* Display of ROM attribute enabled? */
@@ -1217,11 +1218,40 @@ int pcie_capability_read_word(struct pci_dev *dev, int pos, u16 *val);
int pcie_capability_read_dword(struct pci_dev *dev, int pos, u32 *val);
int pcie_capability_write_word(struct pci_dev *dev, int pos, u16 val);
int pcie_capability_write_dword(struct pci_dev *dev, int pos, u32 val);
-int pcie_capability_clear_and_set_word(struct pci_dev *dev, int pos,
- u16 clear, u16 set);
+int pcie_capability_clear_and_set_word_unlocked(struct pci_dev *dev, int pos,
+ u16 clear, u16 set);
+int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos,
+ u16 clear, u16 set);
int pcie_capability_clear_and_set_dword(struct pci_dev *dev, int pos,
u32 clear, u32 set);
+/**
+ * pcie_capability_clear_and_set_word - RMW accessor for PCI Express Capability Registers
+ * @dev: PCI device structure of the PCI Express device
+ * @pos: PCI Express Capability Register
+ * @clear: Clear bitmask
+ * @set: Set bitmask
+ *
+ * Perform a Read-Modify-Write (RMW) operation using @clear and @set
+ * bitmasks on PCI Express Capability Register at @pos. Certain PCI Express
+ * Capability Registers are accessed concurrently in RMW fashion, hence
+ * require locking which is handled transparently to the caller.
+ */
+static inline int pcie_capability_clear_and_set_word(struct pci_dev *dev,
+ int pos,
+ u16 clear, u16 set)
+{
+ switch (pos) {
+ case PCI_EXP_LNKCTL:
+ case PCI_EXP_RTCTL:
+ return pcie_capability_clear_and_set_word_locked(dev, pos,
+ clear, set);
+ default:
+ return pcie_capability_clear_and_set_word_unlocked(dev, pos,
+ clear, set);
+ }
+}
+
static inline int pcie_capability_set_word(struct pci_dev *dev, int pos,
u16 set)
{
--
2.30.2
Currently, associating a loop device with a different file descriptor
does not increment its diskseq. This allows the following race
condition:
1. Program X opens a loop device
2. Program X gets the diskseq of the loop device.
3. Program X associates a file with the loop device.
4. Program X passes the loop device major, minor, and diskseq to
something.
5. Program X exits.
6. Program Y detaches the file from the loop device.
7. Program Y attaches a different file to the loop device.
8. The opener finally gets around to opening the loop device and checks
that the diskseq is what it expects it to be. Even though the
diskseq is the expected value, the result is that the opener is
accessing the wrong file.
From discussions with Christoph Hellwig, it appears that
disk_force_media_change() was supposed to call inc_diskseq(), but in
fact it does not. Adding a Fixes: tag to indicate this. Christoph's
Reported-by is because he stated that disk_force_media_change()
calls inc_diskseq(), which is what led me to discover that it should but
does not.
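A standalone sketch of the consumer-side check this race defeats
(hypothetical names): once disk_force_media_change() bumps the
sequence number, a stale handle becomes detectable:
```
#include <stdint.h>
#include <stdio.h>

/* The opener compares the disk's sequence number against the one it
 * was handed; without inc_diskseq() on media change, a re-attached
 * loop device would still pass this check. */
struct gendisk { uint64_t diskseq; };

static void attach_backing_file(struct gendisk *d) { d->diskseq++; }

int main(void)
{
	struct gendisk loop0 = { .diskseq = 7 };
	uint64_t expected = loop0.diskseq;  /* handed to the opener */

	attach_backing_file(&loop0);        /* someone swaps the file */

	if (loop0.diskseq != expected)
		printf("stale handle detected, refuse open\n");
	return 0;
}
```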
Reported-by: Christoph Hellwig <hch(a)infradead.org>
Signed-off-by: Demi Marie Obenour <demi(a)invisiblethingslab.com>
Fixes: e6138dc12de9 ("block: add a helper to raise a media changed event")
Cc: stable(a)vger.kernel.org # 5.15+
---
block/disk-events.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/block/disk-events.c b/block/disk-events.c
index aee25a7e1ab7de8cc82b3c3774e83489d3a86ff9..450c2cbe23d56cc0fa8fa40db9866cdae0e7a626 100644
--- a/block/disk-events.c
+++ b/block/disk-events.c
@@ -307,6 +307,7 @@ bool disk_force_media_change(struct gendisk *disk, unsigned int events)
if (!(events & DISK_EVENT_MEDIA_CHANGE))
return false;
+ inc_diskseq(disk);
if (__invalidate_device(disk->part0, true))
pr_warn("VFS: busy inodes on changed media %s\n",
disk->disk_name);
--
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab
The following commit has been merged into the x86/core branch of tip:
Commit-ID: 9b040453d4440659f33dc6f0aa26af418ebfe70b
Gitweb: https://git.kernel.org/tip/9b040453d4440659f33dc6f0aa26af418ebfe70b
Author: Tony Battersby <tonyb(a)cybernetics.com>
AuthorDate: Thu, 15 Jun 2023 22:33:52 +02:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Tue, 20 Jun 2023 14:51:46 +02:00
x86/smp: Dont access non-existing CPUID leaf
stop_this_cpu() tests CPUID leaf 0x8000001f::EAX unconditionally. Intel
CPUs return the content of the highest supported leaf when a non-existing
leaf is read, while AMD CPUs return all zeros for unsupported leafs.
So the result of the test on Intel CPUs is a lottery. While harmless,
it's incorrect and causes the conditional wbinvd() to be issued where
not required.
Check whether the leaf is supported before reading it.
[ tglx: Adjusted changelog ]
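The same pattern can be exercised from userspace with the compiler's
cpuid helpers; a minimal sketch:
```
#include <cpuid.h>
#include <stdio.h>

/* Query the highest supported extended leaf first and only then read
 * 0x8000001f (SME/SEV features, bit 0 = SME). */
int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int max_ext = __get_cpuid_max(0x80000000, NULL);

	if (max_ext >= 0x8000001f &&
	    __get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx) &&
	    (eax & 1))
		printf("SME supported\n");
	else
		printf("leaf absent or SME not supported\n");
	return 0;
}
```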
Fixes: 08f253ec3767 ("x86/cpu: Clear SME feature flag when not in use")
Signed-off-by: Tony Battersby <tonyb(a)cybernetics.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Mario Limonciello <mario.limonciello(a)amd.com>
Reviewed-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/3817d810-e0f1-8ef8-0bbd-663b919ca49b@cybernetics.…
Link: https://lore.kernel.org/r/20230615193330.322186388@linutronix.de
---
arch/x86/kernel/process.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 05924bc..ff9b80a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -763,6 +763,7 @@ struct cpumask cpus_stop_mask;
void __noreturn stop_this_cpu(void *dummy)
{
+ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
unsigned int cpu = smp_processor_id();
local_irq_disable();
@@ -777,7 +778,7 @@ void __noreturn stop_this_cpu(void *dummy)
*/
set_cpu_online(cpu, false);
disable_local_APIC();
- mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
+ mcheck_cpu_clear(c);
/*
* Use wbinvd on processors that support SME. This provides support
@@ -791,7 +792,7 @@ void __noreturn stop_this_cpu(void *dummy)
* Test the CPUID bit directly because the machine might've cleared
* X86_FEATURE_SME due to cmdline options.
*/
- if (cpuid_eax(0x8000001f) & BIT(0))
+ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
native_wbinvd();
/*
The following commit has been merged into the x86/core branch of tip:
Commit-ID: 1f5e7eb7868e42227ac426c96d437117e6e06e8e
Gitweb: https://git.kernel.org/tip/1f5e7eb7868e42227ac426c96d437117e6e06e8e
Author: Thomas Gleixner <tglx(a)linutronix.de>
AuthorDate: Wed, 26 Apr 2023 18:37:00 +02:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Tue, 20 Jun 2023 14:51:46 +02:00
x86/smp: Make stop_other_cpus() more robust
Tony reported intermittent lockups on poweroff. His analysis identified the
wbinvd() in stop_this_cpu() as the culprit. This was added to ensure that
on SME enabled machines a kexec() does not leave any stale data in the
caches when switching from encrypted to non-encrypted mode or vice versa.
That wbinvd() is conditional on the SME feature bit which is read directly
from CPUID. But that readout does not check whether the CPUID leaf is
available or not. If it's not available the CPU will return the value of
the highest supported leaf instead. Depending on the content the "SME" bit
might be set or not.
That's incorrect but harmless. Making the CPUID readout conditional makes
the observed hangs go away, but it does not fix the underlying problem:
CPU0 CPU1
stop_other_cpus()
send_IPIs(REBOOT); stop_this_cpu()
while (num_online_cpus() > 1); set_online(false);
proceed... -> hang
wbinvd()
WBINVD is an expensive operation and if multiple CPUs issue it at the same
time the resulting delays are even larger.
But CPU0 already observed num_online_cpus() going down to 1 and proceeds
which causes the system to hang.
This issue exists independent of WBINVD, but the delays caused by WBINVD
make it more prominent.
Make this more robust by adding a cpumask which is initialized to the
online CPU mask before sending the IPIs and CPUs clear their bit in
stop_this_cpu() after the WBINVD completed. Check for that cpumask to
become empty in stop_other_cpus() instead of watching num_online_cpus().
The cpumask cannot plug all holes either, but it's better than a raw
counter and allows restricting the NMI fallback IPI to be sent only to
the CPUs which have not reported within the timeout window.
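The scheme can be sketched in userspace with threads standing in for
CPUs (simplified: no timeout and no NMI fallback):
```
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NCPUS 4

/* Each "CPU" clears its own bit only after finishing its expensive
 * work (standing in for WBINVD); the initiator waits for the mask to
 * drain instead of watching an online counter that drops too early. */
static atomic_uint stop_mask;

static void *stop_this_cpu(void *arg)
{
	unsigned int cpu = (unsigned int)(uintptr_t)arg;

	/* ... expensive cache flush would happen here ... */
	atomic_fetch_and(&stop_mask, ~(1u << cpu));
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS];

	atomic_store(&stop_mask, (1u << NCPUS) - 1);
	for (unsigned int i = 0; i < NCPUS; i++)
		pthread_create(&t[i], NULL, stop_this_cpu,
			       (void *)(uintptr_t)i);

	while (atomic_load(&stop_mask))
		;  /* real code adds a timeout and an NMI fallback here */
	printf("all CPUs reported stop\n");

	for (int i = 0; i < NCPUS; i++)
		pthread_join(t[i], NULL);
	return 0;
}
```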
Fixes: 08f253ec3767 ("x86/cpu: Clear SME feature flag when not in use")
Reported-by: Tony Battersby <tonyb(a)cybernetics.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Reviewed-by: Ashok Raj <ashok.raj(a)intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/3817d810-e0f1-8ef8-0bbd-663b919ca49b@cybernetic…
Link: https://lore.kernel.org/r/87h6r770bv.ffs@tglx
---
arch/x86/include/asm/cpu.h | 2 +-
arch/x86/kernel/process.c | 23 ++++++++++++--
arch/x86/kernel/smp.c | 62 ++++++++++++++++++++++++-------------
3 files changed, 64 insertions(+), 23 deletions(-)
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 78796b9..9ba3c3d 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -98,4 +98,6 @@ extern u64 x86_read_arch_cap_msr(void);
int intel_find_matching_signature(void *mc, unsigned int csig, int cpf);
int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type);
+extern struct cpumask cpus_stop_mask;
+
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index dac41a0..05924bc 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -759,13 +759,23 @@ bool xen_set_default_idle(void)
}
#endif
+struct cpumask cpus_stop_mask;
+
void __noreturn stop_this_cpu(void *dummy)
{
+ unsigned int cpu = smp_processor_id();
+
local_irq_disable();
+
/*
- * Remove this CPU:
+ * Remove this CPU from the online mask and disable it
+ * unconditionally. This might be redundant in case that the reboot
+ * vector was handled late and stop_other_cpus() sent an NMI.
+ *
+ * According to SDM and APM NMIs can be accepted even after soft
+ * disabling the local APIC.
*/
- set_cpu_online(smp_processor_id(), false);
+ set_cpu_online(cpu, false);
disable_local_APIC();
mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
@@ -783,6 +793,15 @@ void __noreturn stop_this_cpu(void *dummy)
*/
if (cpuid_eax(0x8000001f) & BIT(0))
native_wbinvd();
+
+ /*
+ * This brings a cache line back and dirties it, but
+ * native_stop_other_cpus() will overwrite cpus_stop_mask after it
+ * observed that all CPUs reported stop. This write will invalidate
+ * the related cache line on this CPU.
+ */
+ cpumask_clear_cpu(cpu, &cpus_stop_mask);
+
for (;;) {
/*
* Use native_halt() so that memory contents don't change
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 375b33e..935bc65 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -27,6 +27,7 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apic.h>
+#include <asm/cpu.h>
#include <asm/idtentry.h>
#include <asm/nmi.h>
#include <asm/mce.h>
@@ -146,31 +147,43 @@ static int register_stop_handler(void)
static void native_stop_other_cpus(int wait)
{
- unsigned long flags;
- unsigned long timeout;
+ unsigned int cpu = smp_processor_id();
+ unsigned long flags, timeout;
if (reboot_force)
return;
- /*
- * Use an own vector here because smp_call_function
- * does lots of things not suitable in a panic situation.
- */
+ /* Only proceed if this is the first CPU to reach this code */
+ if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
+ return;
/*
- * We start by using the REBOOT_VECTOR irq.
- * The irq is treated as a sync point to allow critical
- * regions of code on other cpus to release their spin locks
- * and re-enable irqs. Jumping straight to an NMI might
- * accidentally cause deadlocks with further shutdown/panic
- * code. By syncing, we give the cpus up to one second to
- * finish their work before we force them off with the NMI.
+ * 1) Send an IPI on the reboot vector to all other CPUs.
+ *
+ * The other CPUs should react on it after leaving critical
+ * sections and re-enabling interrupts. They might still hold
+ * locks, but there is nothing which can be done about that.
+ *
+ * 2) Wait for all other CPUs to report that they reached the
+ * HLT loop in stop_this_cpu()
+ *
+ * 3) If #2 timed out send an NMI to the CPUs which did not
+ * yet report
+ *
+ * 4) Wait for all other CPUs to report that they reached the
+ * HLT loop in stop_this_cpu()
+ *
+ * #3 can obviously race against a CPU reaching the HLT loop late.
+ * That CPU will have reported already and the "have all CPUs
+ * reached HLT" condition will be true despite the fact that the
+ * other CPU is still handling the NMI. Again, there is no
+ * protection against that as "disabled" APICs still respond to
+ * NMIs.
*/
- if (num_online_cpus() > 1) {
- /* did someone beat us here? */
- if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
- return;
+ cpumask_copy(&cpus_stop_mask, cpu_online_mask);
+ cpumask_clear_cpu(cpu, &cpus_stop_mask);
+ if (!cpumask_empty(&cpus_stop_mask)) {
/* sync above data before sending IRQ */
wmb();
@@ -183,12 +196,12 @@ static void native_stop_other_cpus(int wait)
* CPUs reach shutdown state.
*/
timeout = USEC_PER_SEC;
- while (num_online_cpus() > 1 && timeout--)
+ while (!cpumask_empty(&cpus_stop_mask) && timeout--)
udelay(1);
}
/* if the REBOOT_VECTOR didn't work, try with the NMI */
- if (num_online_cpus() > 1) {
+ if (!cpumask_empty(&cpus_stop_mask)) {
/*
* If NMI IPI is enabled, try to register the stop handler
* and send the IPI. In any case try to wait for the other
@@ -200,7 +213,8 @@ static void native_stop_other_cpus(int wait)
pr_emerg("Shutting down cpus with NMI\n");
- apic_send_IPI_allbutself(NMI_VECTOR);
+ for_each_cpu(cpu, &cpus_stop_mask)
+ apic->send_IPI(cpu, NMI_VECTOR);
}
/*
* Don't wait longer than 10 ms if the caller didn't
@@ -208,7 +222,7 @@ static void native_stop_other_cpus(int wait)
* one or more CPUs do not reach shutdown state.
*/
timeout = USEC_PER_MSEC * 10;
- while (num_online_cpus() > 1 && (wait || timeout--))
+ while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--))
udelay(1);
}
@@ -216,6 +230,12 @@ static void native_stop_other_cpus(int wait)
disable_local_APIC();
mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
local_irq_restore(flags);
+
+ /*
+ * Ensure that the cpus_stop_mask cache lines are invalidated on
+ * the other CPUs. See comment vs. SME in stop_this_cpu().
+ */
+ cpumask_clear(&cpus_stop_mask);
}
/*
The following commit has been merged into the x86/core branch of tip:
Commit-ID: 2affa6d6db28855e6340b060b809c23477aa546e
Gitweb: https://git.kernel.org/tip/2affa6d6db28855e6340b060b809c23477aa546e
Author: Thomas Gleixner <tglx(a)linutronix.de>
AuthorDate: Thu, 15 Jun 2023 22:33:54 +02:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Tue, 20 Jun 2023 14:51:46 +02:00
x86/smp: Remove pointless wmb()s from native_stop_other_cpus()
The wmb()s before sending the IPIs are not synchronizing anything.
If at all then the apic IPI functions have to provide or act as appropriate
barriers.
Remove these cargo cult barriers which have no explanation of what they are
synchronizing.
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615193330.378358382@linutronix.de
---
arch/x86/kernel/smp.c | 6 ------
1 file changed, 6 deletions(-)
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 935bc65..d842875 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -184,9 +184,6 @@ static void native_stop_other_cpus(int wait)
cpumask_clear_cpu(cpu, &cpus_stop_mask);
if (!cpumask_empty(&cpus_stop_mask)) {
- /* sync above data before sending IRQ */
- wmb();
-
apic_send_IPI_allbutself(REBOOT_VECTOR);
/*
@@ -208,9 +205,6 @@ static void native_stop_other_cpus(int wait)
* CPUs to stop.
*/
if (!smp_no_nmi_ipi && !register_stop_handler()) {
- /* Sync above data before sending IRQ */
- wmb();
-
pr_emerg("Shutting down cpus with NMI\n");
for_each_cpu(cpu, &cpus_stop_mask)
The following commit has been merged into the x86/core branch of tip:
Commit-ID: f9c9987bf52f4e42e940ae217333ebb5a4c3b506
Gitweb: https://git.kernel.org/tip/f9c9987bf52f4e42e940ae217333ebb5a4c3b506
Author: Thomas Gleixner <tglx(a)linutronix.de>
AuthorDate: Thu, 15 Jun 2023 22:33:55 +02:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Tue, 20 Jun 2023 14:51:47 +02:00
x86/smp: Use dedicated cache-line for mwait_play_dead()
Monitoring idletask::thread_info::flags in mwait_play_dead() has been an
obvious choice, as all that is needed is a cache line which is not written
to by other CPUs.
But there is a use case where a "dead" CPU needs to be brought out of
MWAIT: kexec().
This is required as kexec() can overwrite the text, page tables, stacks and
the monitored cache line of the original kernel. The latter causes MWAIT to
resume execution, which obviously wreaks havoc on the kexec kernel and
usually results in triple faults.
Use dedicated per-CPU storage to prepare for that.
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Ashok Raj <ashok.raj(a)intel.com>
Reviewed-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615193330.434553750@linutronix.de
---
arch/x86/kernel/smpboot.c | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 352f0ce..c5ac5d7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -101,6 +101,17 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map);
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
+struct mwait_cpu_dead {
+ unsigned int control;
+ unsigned int status;
+};
+
+/*
+ * Cache line aligned data for mwait_play_dead(). Separate on purpose so
+ * that it's unlikely to be touched by other CPUs.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
+
/* Logical package management. We might want to allocate that dynamically */
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
@@ -1758,10 +1769,10 @@ EXPORT_SYMBOL_GPL(cond_wakeup_cpu0);
*/
static inline void mwait_play_dead(void)
{
+ struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
unsigned int eax, ebx, ecx, edx;
unsigned int highest_cstate = 0;
unsigned int highest_subcstate = 0;
- void *mwait_ptr;
int i;
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
@@ -1796,13 +1807,6 @@ static inline void mwait_play_dead(void)
(highest_subcstate - 1);
}
- /*
- * This should be a memory location in a cache line which is
- * unlikely to be touched by other processors. The actual
- * content is immaterial as it is not actually modified in any way.
- */
- mwait_ptr = &current_thread_info()->flags;
-
wbinvd();
while (1) {
@@ -1814,9 +1818,9 @@ static inline void mwait_play_dead(void)
* case where we return around the loop.
*/
mb();
- clflush(mwait_ptr);
+ clflush(md);
mb();
- __monitor(mwait_ptr, 0, 0);
+ __monitor(md, 0, 0);
mb();
__mwait(eax, 0);
The following commit has been merged into the x86/core branch of tip:
Commit-ID: d7893093a7417527c0d73c9832244e65c9d0114f
Gitweb: https://git.kernel.org/tip/d7893093a7417527c0d73c9832244e65c9d0114f
Author: Thomas Gleixner <tglx(a)linutronix.de>
AuthorDate: Thu, 15 Jun 2023 22:33:57 +02:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Tue, 20 Jun 2023 14:51:47 +02:00
x86/smp: Cure kexec() vs. mwait_play_dead() breakage
TLDR: It's a mess.
When kexec() is executed on a system with offline CPUs, which are parked in
mwait_play_dead(), it can end up in a triple fault during the bootup of the
kexec kernel or cause hard-to-diagnose data corruption.
The reason is that kexec() eventually overwrites the previous kernel's text,
page tables, data and stack. If it writes to the cache line which is
monitored by a previously offlined CPU, MWAIT resumes execution and ends
up executing the wrong text, dereferencing overwritten page tables or
corrupting the kexec kernel's data.
Cure this by bringing the offlined CPUs out of MWAIT into HLT.
Write to the monitored cache line of each offline CPU, which makes MWAIT
resume execution. The written control word tells the offlined CPUs to issue
HLT, which does not have the MWAIT problem.
That does not help if a stray NMI, MCE or SMI hits the offlined CPUs, as
those bring them out of HLT.
A follow up change will put them into INIT, which protects at least against
NMI and SMI.
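The handshake boils down to a per-CPU control/status word pair. A
simplified standalone sketch of the protocol, with hypothetical stubs in
place of MONITOR/MWAIT, HLT and the microdelay (the real code is in the
diff below):

#include <stdatomic.h>

enum { DEAD_WAIT = 0xDEADBEEF, DEAD_KEXEC_HLT = 0x4A17DEAD };

struct mwait_cpu_dead {
	_Atomic unsigned int control;
	_Atomic unsigned int status;
};

/* Hypothetical stand-ins for MONITOR/MWAIT, HLT and udelay(). */
static void mwait_on(void *line) { (void)line; }
static void halt(void) { }
static void udelay_us(unsigned int us) { (void)us; }

/* Parked CPU: wait on its own cache line; on the kexec command, ack
 * via ->status and switch to HLT, which monitors nothing. */
static void play_dead(struct mwait_cpu_dead *md)
{
	atomic_store(&md->status, DEAD_WAIT);
	atomic_store(&md->control, DEAD_WAIT);

	for (;;) {
		mwait_on(&md->control);
		if (atomic_load(&md->control) == DEAD_KEXEC_HLT) {
			atomic_store(&md->status, DEAD_KEXEC_HLT);
			for (;;)
				halt();
		}
	}
}

/* kexec path: write the control word to break one parked CPU out of
 * MWAIT, then wait briefly for the status acknowledgment. */
static void kick_one(struct mwait_cpu_dead *md)
{
	int i;

	if (atomic_load(&md->status) != DEAD_WAIT)
		return;

	for (i = 0; atomic_load(&md->status) != DEAD_KEXEC_HLT && i < 1000; i++) {
		atomic_store(&md->control, DEAD_KEXEC_HLT);
		udelay_us(5);
	}
}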
Fixes: ea53069231f9 ("x86, hotplug: Use mwait to offline a processor, fix the legacy case")
Reported-by: Ashok Raj <ashok.raj(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Ashok Raj <ashok.raj(a)intel.com>
Reviewed-by: Ashok Raj <ashok.raj(a)intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615193330.492257119@linutronix.de
---
arch/x86/include/asm/smp.h | 2 +-
arch/x86/kernel/smp.c | 5 +++-
arch/x86/kernel/smpboot.c | 59 +++++++++++++++++++++++++++++++++++++-
3 files changed, 66 insertions(+)
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4e91054..d4ce5cb 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -132,6 +132,8 @@ void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
void cond_wakeup_cpu0(void);
+void smp_kick_mwait_play_dead(void);
+
void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d842875..174d623 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/gfp.h>
+#include <linux/kexec.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
@@ -157,6 +158,10 @@ static void native_stop_other_cpus(int wait)
if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
return;
+ /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
+ if (kexec_in_progress)
+ smp_kick_mwait_play_dead();
+
/*
* 1) Send an IPI on the reboot vector to all other CPUs.
*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c5ac5d7..483df04 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,6 +53,7 @@
#include <linux/tboot.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/kexec.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
@@ -106,6 +107,9 @@ struct mwait_cpu_dead {
unsigned int status;
};
+#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
+#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
+
/*
* Cache line aligned data for mwait_play_dead(). Separate on purpose so
* that it's unlikely to be touched by other CPUs.
@@ -173,6 +177,10 @@ static void smp_callin(void)
{
int cpuid;
+ /* Mop up eventual mwait_play_dead() wreckage */
+ this_cpu_write(mwait_cpu_dead.status, 0);
+ this_cpu_write(mwait_cpu_dead.control, 0);
+
/*
* If waken up by an INIT in an 82489DX configuration
* cpu_callout_mask guarantees we don't get here before
@@ -1807,6 +1815,10 @@ static inline void mwait_play_dead(void)
(highest_subcstate - 1);
}
+ /* Set up state for the kexec() hack below */
+ md->status = CPUDEAD_MWAIT_WAIT;
+ md->control = CPUDEAD_MWAIT_WAIT;
+
wbinvd();
while (1) {
@@ -1824,10 +1836,57 @@ static inline void mwait_play_dead(void)
mb();
__mwait(eax, 0);
+ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+ /*
+ * Kexec is about to happen. Don't go back into mwait() as
+ * the kexec kernel might overwrite text and data including
+ * page tables and stack. So mwait() would resume when the
+ * monitor cache line is written to and then the CPU goes
+ * south due to overwritten text, page tables and stack.
+ *
+ * Note: This does _NOT_ protect against a stray MCE, NMI,
+ * SMI. They will resume execution at the instruction
+ * following the HLT instruction and run into the problem
+ * which this is trying to prevent.
+ */
+ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+ while(1)
+ native_halt();
+ }
+
cond_wakeup_cpu0();
}
}
+/*
+ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+ * mwait_play_dead().
+ */
+void smp_kick_mwait_play_dead(void)
+{
+ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+ struct mwait_cpu_dead *md;
+ unsigned int cpu, i;
+
+ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+ md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+
+ /* Does it sit in mwait_play_dead() ? */
+ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+ continue;
+
+ /* Wait up to 5ms */
+ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+ /* Bring it out of mwait */
+ WRITE_ONCE(md->control, newstate);
+ udelay(5);
+ }
+
+ if (READ_ONCE(md->status) != newstate)
+ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
+ }
+}
+
void __noreturn hlt_play_dead(void)
{
if (__this_cpu_read(cpu_info.x86) >= 4)
The errata sheets for both ksz9477 and ksz9567 begin with
IMPORTANT NOTE
Multiple errata workarounds in this document call for changing PHY
registers for each PHY port. PHY registers 0x0 to 0x1F are in the
address range 0xN100 to 0xN13F, while indirect (MMD) PHY registers
are accessed via the PHY MMD Setup Register and the PHY MMD Data
Register.
Before configuring the PHY MMD registers, it is necessary to set the
PHY to 100 Mbps speed with auto-negotiation disabled by writing to
register 0xN100-0xN101. After writing the MMD registers, and after
all errata workarounds that involve PHY register settings, write
register 0xN100-0xN101 again to enable and restart auto-negotiation.
Without that explicit auto-neg restart, we do sometimes have problems
establishing link.
Rather than writing back the hardcoded 0x1340 value the errata sheet
suggests (which likely just corresponds to the most common strap
configuration), restore the original value, setting the
PORT_AUTO_NEG_RESTART bit if PORT_AUTO_NEG_ENABLE is set.
Fixes: 1fc33199185d ("net: dsa: microchip: Add PHY errata workarounds")
Cc: stable(a)vger.kernel.org
Signed-off-by: Rasmus Villemoes <linux(a)rasmusvillemoes.dk>
---
While I do believe this is a fix, I don't think it's post-rc7
material, hence targeting net-next with cc stable.
drivers/net/dsa/microchip/ksz9477.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c
index bf13d47c26cf..9a712ea71ee7 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -902,6 +902,16 @@ static void ksz9477_port_mmd_write(struct ksz_device *dev, int port,
static void ksz9477_phy_errata_setup(struct ksz_device *dev, int port)
{
+ u16 cr;
+
+ /* Errata document says the PHY must be configured to 100Mbps
+ * with auto-neg disabled before configuring the PHY MMD
+ * registers.
+ */
+ ksz_pread16(dev, port, REG_PORT_PHY_CTRL, &cr);
+ ksz_pwrite16(dev, port, REG_PORT_PHY_CTRL,
+ PORT_SPEED_100MBIT | PORT_FULL_DUPLEX);
+
/* Apply PHY settings to address errata listed in
* KSZ9477, KSZ9897, KSZ9896, KSZ9567, KSZ8565
* Silicon Errata and Data Sheet Clarification documents:
@@ -943,6 +953,13 @@ static void ksz9477_phy_errata_setup(struct ksz_device *dev, int port)
ksz9477_port_mmd_write(dev, port, 0x1c, 0x1d, 0xe7ff);
ksz9477_port_mmd_write(dev, port, 0x1c, 0x1e, 0xefff);
ksz9477_port_mmd_write(dev, port, 0x1c, 0x20, 0xeeee);
+
+ /* Restore PHY CTRL register, restart auto-negotiation if
+ * enabled in the original value.
+ */
+ if (cr & PORT_AUTO_NEG_ENABLE)
+ cr |= PORT_AUTO_NEG_RESTART;
+ ksz_pwrite16(dev, port, REG_PORT_PHY_CTRL, cr);
}
void ksz9477_get_caps(struct ksz_device *dev, int port,
--
2.37.2
On Sun 2023-06-18 07:43:10, Manuel Lauss wrote:
> On Fri, Jun 16, 2023 at 9:33 PM Pavel Machek <pavel(a)denx.de> wrote:
>
> > Hi!
> >
> > > From: Manuel Lauss <manuel.lauss(a)gmail.com>
> > >
> > > [ Upstream commit 2d645604f69f3a772d58ead702f9a8e84ab2b342 ]
> > >
> > > Various fixes for the Au1200/Au1550/Au1300 DBDMA2 code:
> > >
> > > - skip cache invalidation if chip has working coherency circuitry.
> > > - invalidate KSEG0-portion of the (physical) data address.
> > > - force the dma channel doorbell write out to bus immediately with
> > > a sync.
> > >
> > > Signed-off-by: Thomas Bogendoerfer <tsbogend(a)alpha.franken.de>
> > > Signed-off-by: Sasha Levin <sashal(a)kernel.org>
> >
> > I believe author's signoff is missing here.
> >
>
> As the author, I say this patch should not be applied to 4.xx at all. Same
> for my other 2 MIPS patches.
Thanks for the info; where is the threshold, do we need them for 5.10?
Sasha, please drop.
Best regards,
Pavel
--
DENX Software Engineering GmbH, Managing Director: Erika Unter
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
commit e18eb8783ec4949adebc7d7b0fdb65f65bfeefd9 upstream.
Currently, tracing_reset_all_online_cpus() requires the trace_types_lock
to be held. But only one caller of this function actually has that lock
held before calling it, and the other just takes the lock so that it can
call it. More users of this function are needed where the lock is not
held.
Add a tracing_reset_all_online_cpus_unlocked() function for the one use
case that calls it with the lock already held, and also add a
lockdep_assert to make sure the lock is held when it is called.
Then have tracing_reset_all_online_cpus() take the lock internally, such
that callers do not need to worry about taking it.
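The result is the common locked/unlocked wrapper idiom. As a minimal
pthread-based sketch, for illustration only (the kernel variant uses
trace_types_lock plus lockdep_assert_held()):

#include <pthread.h>

static pthread_mutex_t types_lock = PTHREAD_MUTEX_INITIALIZER;

/* _unlocked variant: the caller must already hold types_lock; the
 * kernel version enforces this with lockdep_assert_held(). */
static void reset_all_online_cpus_unlocked(void)
{
	/* ... walk the trace arrays and reset their buffers ... */
}

/* Wrapper: takes the lock itself, so ordinary callers never have to
 * worry about it. */
static void reset_all_online_cpus(void)
{
	pthread_mutex_lock(&types_lock);
	reset_all_online_cpus_unlocked();
	pthread_mutex_unlock(&types_lock);
}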
Link: https://lkml.kernel.org/r/20221123192741.658273220@goodmis.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Zheng Yejian <zhengyejian1(a)huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
[Refers to commit message of 1603feac154ff38514e8354e3079a455eb4801e2,
this patch is pre-depended, and tracing_reset_all_online_cpus() should
be called after trace_types_lock is held as its comment describes.]
Fixes: 1603feac154f ("tracing: Free buffers when a used dynamic event is removed")
Signed-off-by: Zheng Yejian <zhengyejian1(a)huawei.com>
---
kernel/trace/trace.c | 11 ++++++++++-
kernel/trace/trace.h | 1 +
kernel/trace/trace_events.c | 2 +-
3 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d068124815bc..219cd2c81936 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1931,10 +1931,12 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
}
/* Must have trace_types_lock held */
-void tracing_reset_all_online_cpus(void)
+void tracing_reset_all_online_cpus_unlocked(void)
{
struct trace_array *tr;
+ lockdep_assert_held(&trace_types_lock);
+
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (!tr->clear_trace)
continue;
@@ -1946,6 +1948,13 @@ void tracing_reset_all_online_cpus(void)
}
}
+void tracing_reset_all_online_cpus(void)
+{
+ mutex_lock(&trace_types_lock);
+ tracing_reset_all_online_cpus_unlocked();
+ mutex_unlock(&trace_types_lock);
+}
+
/*
* The tgid_map array maps from pid to tgid; i.e. the value stored at index i
* is the tgid last observed corresponding to pid=i.
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f2ff39353e03..edc17a640ab3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -677,6 +677,7 @@ int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct trace_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
+void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8f2cbc9ebb6e..a0675ecc8142 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2440,7 +2440,7 @@ static void trace_module_remove_events(struct module *mod)
* over from this module may be passed to the new module events and
* unexpected results may occur.
*/
- tracing_reset_all_online_cpus();
+ tracing_reset_all_online_cpus_unlocked();
}
static int trace_module_notify(struct notifier_block *self,
--
2.25.1
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
commit e18eb8783ec4949adebc7d7b0fdb65f65bfeefd9 upstream.
Currently, tracing_reset_all_online_cpus() requires the trace_types_lock
to be held. But only one caller of this function actually has that lock
held before calling it, and the other just takes the lock so that it can
call it. More users of this function are needed where the lock is not
held.
Add a tracing_reset_all_online_cpus_unlocked() function for the one use
case that calls it with the lock already held, and also add a
lockdep_assert to make sure the lock is held when it is called.
Then have tracing_reset_all_online_cpus() take the lock internally, such
that callers do not need to worry about taking it.
Link: https://lkml.kernel.org/r/20221123192741.658273220@goodmis.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Zheng Yejian <zhengyejian1(a)huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
[Refers to commit message of be111ebd8868d4b7c041cb3c6102e1ae27d6dc1d,
this patch is pre-depended, and tracing_reset_all_online_cpus() should
be called after trace_types_lock is held as its comment describes.]
Fixes: be111ebd8868 ("tracing: Free buffers when a used dynamic event is removed")
Signed-off-by: Zheng Yejian <zhengyejian1(a)huawei.com>
---
kernel/trace/trace.c | 11 ++++++++++-
kernel/trace/trace.h | 1 +
kernel/trace/trace_events.c | 2 +-
kernel/trace/trace_events_synth.c | 2 --
4 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 482ec6606b7b..70526400e05c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2178,10 +2178,12 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
}
/* Must have trace_types_lock held */
-void tracing_reset_all_online_cpus(void)
+void tracing_reset_all_online_cpus_unlocked(void)
{
struct trace_array *tr;
+ lockdep_assert_held(&trace_types_lock);
+
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (!tr->clear_trace)
continue;
@@ -2193,6 +2195,13 @@ void tracing_reset_all_online_cpus(void)
}
}
+void tracing_reset_all_online_cpus(void)
+{
+ mutex_lock(&trace_types_lock);
+ tracing_reset_all_online_cpus_unlocked();
+ mutex_unlock(&trace_types_lock);
+}
+
/*
* The tgid_map array maps from pid to tgid; i.e. the value stored at index i
* is the tgid last observed corresponding to pid=i.
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 37f616bf5fa9..e5b505b5b7d0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -725,6 +725,7 @@ int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct array_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
+void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index bac13f24a96e..f8ed66f38175 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2661,7 +2661,7 @@ static void trace_module_remove_events(struct module *mod)
* over from this module may be passed to the new module events and
* unexpected results may occur.
*/
- tracing_reset_all_online_cpus();
+ tracing_reset_all_online_cpus_unlocked();
}
static int trace_module_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 18291ab35657..ee174de0b8f6 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1363,7 +1363,6 @@ int synth_event_delete(const char *event_name)
mutex_unlock(&event_mutex);
if (mod) {
- mutex_lock(&trace_types_lock);
/*
* It is safest to reset the ring buffer if the module
* being unloaded registered any events that were
@@ -1375,7 +1374,6 @@ int synth_event_delete(const char *event_name)
* occur.
*/
tracing_reset_all_online_cpus();
- mutex_unlock(&trace_types_lock);
}
return ret;
--
2.25.1
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 306320034e8fbe7ee1cc4f5269c55658b4612048
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061831-detention-overtime-783b@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 306320034e8fbe7ee1cc4f5269c55658b4612048 Mon Sep 17 00:00:00 2001
From: Bernhard Seibold <mail(a)bernhard-seibold.de>
Date: Fri, 2 Jun 2023 15:30:29 +0200
Subject: [PATCH] serial: lantiq: add missing interrupt ack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Currently, the error interrupt is never acknowledged, so once active it
will stay active indefinitely, causing the handler to be called in an
infinite loop.
Fixes: 2f0fc4159a6a ("SERIAL: Lantiq: Add driver for MIPS Lantiq SOCs.")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Bernhard Seibold <mail(a)bernhard-seibold.de>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Message-ID: <20230602133029.546-1-mail(a)bernhard-seibold.de>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/lantiq.c b/drivers/tty/serial/lantiq.c
index a58e9277dfad..f1387f1024db 100644
--- a/drivers/tty/serial/lantiq.c
+++ b/drivers/tty/serial/lantiq.c
@@ -250,6 +250,7 @@ lqasc_err_int(int irq, void *_port)
struct ltq_uart_port *ltq_port = to_ltq_uart_port(port);
spin_lock_irqsave(&ltq_port->lock, flags);
+ __raw_writel(ASC_IRNCR_EIR, port->membase + LTQ_ASC_IRNCR);
/* clear any pending interrupts */
asc_update_bits(0, ASCWHBSTATE_CLRPE | ASCWHBSTATE_CLRFE |
ASCWHBSTATE_CLRROE, port->membase + LTQ_ASC_WHBSTATE);
The quilt patch titled
Subject: nilfs2: prevent general protection fault in nilfs_clear_dirty_page()
has been removed from the -mm tree. Its filename was
nilfs2-prevent-general-protection-fault-in-nilfs_clear_dirty_page.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Subject: nilfs2: prevent general protection fault in nilfs_clear_dirty_page()
Date: Mon, 12 Jun 2023 11:14:56 +0900
In a syzbot stress test that deliberately causes file system errors on
nilfs2 with a corrupted disk image, it has been reported that
nilfs_clear_dirty_page() called from nilfs_clear_dirty_pages() can cause a
general protection fault.
In nilfs_clear_dirty_pages(), when looking up dirty pages from the page
cache and calling nilfs_clear_dirty_page() for each dirty page/folio
retrieved, the back reference from the argument page to "mapping" may have
been changed to NULL (and possibly other fields). It is necessary to check this
after locking the page/folio.
So, fix this issue by not calling nilfs_clear_dirty_page() on a page/folio
after locking it in nilfs_clear_dirty_pages() if the back reference
"mapping" from the page/folio is different from the "mapping" that held
the page/folio just before.
Link: https://lkml.kernel.org/r/20230612021456.3682-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+53369d11851d8f26735c(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/000000000000da4f6b05eb9bf593@google.com
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/page.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
--- a/fs/nilfs2/page.c~nilfs2-prevent-general-protection-fault-in-nilfs_clear_dirty_page
+++ a/fs/nilfs2/page.c
@@ -370,7 +370,15 @@ void nilfs_clear_dirty_pages(struct addr
struct folio *folio = fbatch.folios[i];
folio_lock(folio);
- nilfs_clear_dirty_page(&folio->page, silent);
+
+ /*
+ * This folio may have been removed from the address
+ * space by truncation or invalidation when the lock
+ * was acquired. Skip processing in that case.
+ */
+ if (likely(folio->mapping == mapping))
+ nilfs_clear_dirty_page(&folio->page, silent);
+
folio_unlock(folio);
}
folio_batch_release(&fbatch);
_
Patches currently in -mm which might be from konishi.ryusuke(a)gmail.com are
The quilt patch titled
Subject: nilfs2: fix buffer corruption due to concurrent device reads
has been removed from the -mm tree. Its filename was
nilfs2-fix-buffer-corruption-due-to-concurrent-device-reads.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Subject: nilfs2: fix buffer corruption due to concurrent device reads
Date: Fri, 9 Jun 2023 12:57:32 +0900
As a result of analysis of a syzbot report, it turned out that in three
cases where nilfs2 allocates block device buffers directly via sb_getblk,
concurrent reads to the device can corrupt the allocated buffers.
Nilfs2 uses sb_getblk for segment summary blocks, which make up a log
header, for the super root block, which is the trailer, and when moving
and writing the second super block after an fs resize.
In any of these, since the uptodate flag is not set when storing metadata
to be written in the allocated buffers, the stored metadata will be
overwritten if a device read of the same block occurs concurrently before
the write. This causes metadata corruption and misbehavior in the log
write itself, causing warnings in nilfs_btree_assign() as reported.
Fix these issues by setting an uptodate flag on the buffer head on the
first or before modifying each buffer obtained with sb_getblk, and
clearing the flag on failure.
When setting the uptodate flag, the lock_buffer/unlock_buffer pair is used
to perform necessary exclusive control, and the buffer is filled to ensure
that uninitialized bytes are not mixed into the data read from others. As
for buffers for segment summary blocks, they are filled incrementally, so
if the uptodate flag was unset on their allocation, set the flag and zero
fill the buffer once at that point.
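In isolation, the idiom applied to each buffer looks roughly like this (a
sketch reusing the buffer-head calls from the hunks below; kernel context
assumed, error handling trimmed):

	struct buffer_head *bh = sb_getblk(sb, blocknr);

	if (unlikely(!bh))
		return -ENOMEM;

	lock_buffer(bh);
	if (!buffer_uptodate(bh)) {
		/* Zero-fill before publishing so a concurrent read of
		 * the same device block cannot clobber or expose
		 * uninitialized bytes. */
		memset(bh->b_data, 0, bh->b_size);
		set_buffer_uptodate(bh);
	}
	unlock_buffer(bh);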
Also, regarding the superblock move routine, the starting point of the
memset call to zerofill the block is incorrectly specified, which can
cause a buffer overflow on file systems with block sizes greater than
4KiB. In addition, if the superblock is moved within a large block, it is
necessary to assume the possibility that the data in the superblock will
be destroyed by zero-filling before copying. So fix these potential
issues as well.
Link: https://lkml.kernel.org/r/20230609035732.20426-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+31837fe952932efc8fb9(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000030000a05e981f475@google.com
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/segbuf.c | 6 ++++++
fs/nilfs2/segment.c | 7 +++++++
fs/nilfs2/super.c | 23 ++++++++++++++++++++++-
3 files changed, 35 insertions(+), 1 deletion(-)
--- a/fs/nilfs2/segbuf.c~nilfs2-fix-buffer-corruption-due-to-concurrent-device-reads
+++ a/fs/nilfs2/segbuf.c
@@ -101,6 +101,12 @@ int nilfs_segbuf_extend_segsum(struct ni
if (unlikely(!bh))
return -ENOMEM;
+ lock_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ }
+ unlock_buffer(bh);
nilfs_segbuf_add_segsum_buffer(segbuf, bh);
return 0;
}
--- a/fs/nilfs2/segment.c~nilfs2-fix-buffer-corruption-due-to-concurrent-device-reads
+++ a/fs/nilfs2/segment.c
@@ -981,10 +981,13 @@ static void nilfs_segctor_fill_in_super_
unsigned int isz, srsz;
bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
+
+ lock_buffer(bh_sr);
raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
isz = nilfs->ns_inode_size;
srsz = NILFS_SR_BYTES(isz);
+ raw_sr->sr_sum = 0; /* Ensure initialization within this update */
raw_sr->sr_bytes = cpu_to_le16(srsz);
raw_sr->sr_nongc_ctime
= cpu_to_le64(nilfs_doing_gc() ?
@@ -998,6 +1001,8 @@ static void nilfs_segctor_fill_in_super_
nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
NILFS_SR_SUFILE_OFFSET(isz), 1);
memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
+ set_buffer_uptodate(bh_sr);
+ unlock_buffer(bh_sr);
}
static void nilfs_redirty_inodes(struct list_head *head)
@@ -1780,6 +1785,7 @@ static void nilfs_abort_logs(struct list
list_for_each_entry(segbuf, logs, sb_list) {
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
+ clear_buffer_uptodate(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1791,6 +1797,7 @@ static void nilfs_abort_logs(struct list
b_assoc_buffers) {
clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
+ clear_buffer_uptodate(bh);
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
bd_page = bh->b_page;
--- a/fs/nilfs2/super.c~nilfs2-fix-buffer-corruption-due-to-concurrent-device-reads
+++ a/fs/nilfs2/super.c
@@ -372,10 +372,31 @@ static int nilfs_move_2nd_super(struct s
goto out;
}
nsbp = (void *)nsbh->b_data + offset;
- memset(nsbp, 0, nilfs->ns_blocksize);
+ lock_buffer(nsbh);
if (sb2i >= 0) {
+ /*
+ * The position of the second superblock only changes by 4KiB,
+ * which is larger than the maximum superblock data size
+ * (= 1KiB), so there is no need to use memmove() to allow
+ * overlap between source and destination.
+ */
memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize);
+
+ /*
+ * Zero fill after copy to avoid overwriting in case of move
+ * within the same block.
+ */
+ memset(nsbh->b_data, 0, offset);
+ memset((void *)nsbp + nilfs->ns_sbsize, 0,
+ nsbh->b_size - offset - nilfs->ns_sbsize);
+ } else {
+ memset(nsbh->b_data, 0, nsbh->b_size);
+ }
+ set_buffer_uptodate(nsbh);
+ unlock_buffer(nsbh);
+
+ if (sb2i >= 0) {
brelse(nilfs->ns_sbh[sb2i]);
nilfs->ns_sbh[sb2i] = nsbh;
nilfs->ns_sbp[sb2i] = nsbp;
_
Patches currently in -mm which might be from konishi.ryusuke(a)gmail.com are
The quilt patch titled
Subject: scripts: fix the gfp flags header path in gfp-translate
has been removed from the -mm tree. Its filename was
scripts-fix-the-gfp-flags-header-path-in-gfp-translate.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Prathu Baronia <prathubaronia2011(a)gmail.com>
Subject: scripts: fix the gfp flags header path in gfp-translate
Date: Thu, 8 Jun 2023 21:14:49 +0530
Since the gfp flags have been moved to gfp_types.h, update the path in
the gfp-translate script.
Link: https://lkml.kernel.org/r/20230608154450.21758-1-prathubaronia2011@gmail.com
Fixes: cb5a065b4ea9c ("headers/deps: mm: Split <linux/gfp_types.h> out of <linux/gfp.h>")
Signed-off-by: Prathu Baronia <prathubaronia2011(a)gmail.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Cc: Masahiro Yamada <masahiroy(a)kernel.org>
Cc: Nathan Chancellor <nathan(a)kernel.org>
Cc: Nick Desaulniers <ndesaulniers(a)google.com>
Cc: Nicolas Schier <nicolas(a)fjasle.eu>
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Yury Norov <yury.norov(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
scripts/gfp-translate | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/scripts/gfp-translate~scripts-fix-the-gfp-flags-header-path-in-gfp-translate
+++ a/scripts/gfp-translate
@@ -63,11 +63,11 @@ fi
# Extract GFP flags from the kernel source
TMPFILE=`mktemp -t gfptranslate-XXXXXX` || exit 1
-grep -q ___GFP $SOURCE/include/linux/gfp.h
+grep -q ___GFP $SOURCE/include/linux/gfp_types.h
if [ $? -eq 0 ]; then
- grep "^#define ___GFP" $SOURCE/include/linux/gfp.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE
+ grep "^#define ___GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE
else
- grep "^#define __GFP" $SOURCE/include/linux/gfp.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE
+ grep "^#define __GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE
fi
# Parse the flags
_
Patches currently in -mm which might be from prathubaronia2011(a)gmail.com are
The quilt patch titled
Subject: udmabuf: revert 'Add support for mapping hugepages (v4)'
has been removed from the -mm tree. Its filename was
udmabuf-revert-add-support-for-mapping-hugepages-v4.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Subject: udmabuf: revert 'Add support for mapping hugepages (v4)'
Date: Thu, 8 Jun 2023 13:49:27 -0700
This effectively reverts commit 16c243e99d33 ("udmabuf: Add support for
mapping hugepages (v4)"). Recently, Junxiao Chang found a BUG with page
map counting as described here [1]. This issue pointed out that the
udmabuf driver was making direct use of subpages of hugetlb pages. This
is not a good idea, and no other mm code attempts such use. In addition
to the mapcount issue, this also causes issues with hugetlb vmemmap
optimization and page poisoning.
For now, remove hugetlb support.
If udmabuf wants to be used on hugetlb mappings, it should be changed to
only use complete hugetlb pages. This will impose different alignment
and size requirements on the UDMABUF_CREATE API.
[1] https://lore.kernel.org/linux-mm/20230512072036.1027784-1-junxiao.chang@int…
Link: https://lkml.kernel.org/r/20230608204927.88711-1-mike.kravetz@oracle.com
Fixes: 16c243e99d33 ("udmabuf: Add support for mapping hugepages (v4)")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Acked-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Acked-by: Vivek Kasireddy <vivek.kasireddy(a)intel.com>
Acked-by: Gerd Hoffmann <kraxel(a)redhat.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Dongwon Kim <dongwon.kim(a)intel.com>
Cc: James Houghton <jthoughton(a)google.com>
Cc: Jerome Marchand <jmarchan(a)redhat.com>
Cc: Junxiao Chang <junxiao.chang(a)intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
drivers/dma-buf/udmabuf.c | 47 ++++--------------------------------
1 file changed, 6 insertions(+), 41 deletions(-)
--- a/drivers/dma-buf/udmabuf.c~udmabuf-revert-add-support-for-mapping-hugepages-v4
+++ a/drivers/dma-buf/udmabuf.c
@@ -12,7 +12,6 @@
#include <linux/shmem_fs.h>
#include <linux/slab.h>
#include <linux/udmabuf.h>
-#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/iosys-map.h>
@@ -207,9 +206,7 @@ static long udmabuf_create(struct miscde
struct udmabuf *ubuf;
struct dma_buf *buf;
pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit;
- struct page *page, *hpage = NULL;
- pgoff_t subpgoff, maxsubpgs;
- struct hstate *hpstate;
+ struct page *page;
int seals, ret = -EINVAL;
u32 i, flags;
@@ -245,7 +242,7 @@ static long udmabuf_create(struct miscde
if (!memfd)
goto err;
mapping = memfd->f_mapping;
- if (!shmem_mapping(mapping) && !is_file_hugepages(memfd))
+ if (!shmem_mapping(mapping))
goto err;
seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
if (seals == -EINVAL)
@@ -256,48 +253,16 @@ static long udmabuf_create(struct miscde
goto err;
pgoff = list[i].offset >> PAGE_SHIFT;
pgcnt = list[i].size >> PAGE_SHIFT;
- if (is_file_hugepages(memfd)) {
- hpstate = hstate_file(memfd);
- pgoff = list[i].offset >> huge_page_shift(hpstate);
- subpgoff = (list[i].offset &
- ~huge_page_mask(hpstate)) >> PAGE_SHIFT;
- maxsubpgs = huge_page_size(hpstate) >> PAGE_SHIFT;
- }
for (pgidx = 0; pgidx < pgcnt; pgidx++) {
- if (is_file_hugepages(memfd)) {
- if (!hpage) {
- hpage = find_get_page_flags(mapping, pgoff,
- FGP_ACCESSED);
- if (!hpage) {
- ret = -EINVAL;
- goto err;
- }
- }
- page = hpage + subpgoff;
- get_page(page);
- subpgoff++;
- if (subpgoff == maxsubpgs) {
- put_page(hpage);
- hpage = NULL;
- subpgoff = 0;
- pgoff++;
- }
- } else {
- page = shmem_read_mapping_page(mapping,
- pgoff + pgidx);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
- goto err;
- }
+ page = shmem_read_mapping_page(mapping, pgoff + pgidx);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ goto err;
}
ubuf->pages[pgbuf++] = page;
}
fput(memfd);
memfd = NULL;
- if (hpage) {
- put_page(hpage);
- hpage = NULL;
- }
}
exp_info.ops = &udmabuf_ops;
_
Patches currently in -mm which might be from mike.kravetz(a)oracle.com are
The quilt patch titled
Subject: memfd: check for non-NULL file_seals in memfd_create() syscall
has been removed from the -mm tree. Its filename was
memfd-check-for-non-null-file_seals-in-memfd_create-syscall.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Roberto Sassu <roberto.sassu(a)huawei.com>
Subject: memfd: check for non-NULL file_seals in memfd_create() syscall
Date: Wed, 7 Jun 2023 15:24:27 +0200
Ensure that file_seals is non-NULL before using it in the memfd_create()
syscall. One situation in which memfd_file_seals_ptr() can return a
NULL pointer is when CONFIG_SHMEM=n, which would oops the kernel.
Link: https://lkml.kernel.org/r/20230607132427.2867435-1-roberto.sassu@huaweiclou…
Fixes: 47b9012ecdc7 ("shmem: add sealing support to hugetlb-backed memfd")
Signed-off-by: Roberto Sassu <roberto.sassu(a)huawei.com>
Cc: Marc-André Lureau <marcandre.lureau(a)redhat.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memfd.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
--- a/mm/memfd.c~memfd-check-for-non-null-file_seals-in-memfd_create-syscall
+++ a/mm/memfd.c
@@ -371,12 +371,15 @@ SYSCALL_DEFINE2(memfd_create,
inode->i_mode &= ~0111;
file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
- *file_seals |= F_SEAL_EXEC;
+ if (file_seals) {
+ *file_seals &= ~F_SEAL_SEAL;
+ *file_seals |= F_SEAL_EXEC;
+ }
} else if (flags & MFD_ALLOW_SEALING) {
/* MFD_EXEC and MFD_ALLOW_SEALING are set */
file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
+ if (file_seals)
+ *file_seals &= ~F_SEAL_SEAL;
}
fd_install(fd, file);
_
Patches currently in -mm which might be from roberto.sassu(a)huawei.com are
shmem-use-ramfs_kill_sb-for-kill_sb-method-of-ramfs-based-tmpfs.patch
The quilt patch titled
Subject: mm/vmalloc: do not output a spurious warning when huge vmalloc() fails
has been removed from the -mm tree. Its filename was
mm-vmalloc-do-not-output-a-spurious-warning-when-huge-vmalloc-fails.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Lorenzo Stoakes <lstoakes(a)gmail.com>
Subject: mm/vmalloc: do not output a spurious warning when huge vmalloc() fails
Date: Mon, 5 Jun 2023 21:11:07 +0100
In __vmalloc_area_node() we always warn_alloc() when an allocation
performed by vm_area_alloc_pages() fails unless it was due to a pending
fatal signal.
However, a huge page allocation instigated either by vmalloc_huge() or
__vmalloc_node_range() (or by a caller that invokes these, like kvmalloc()
or kvmalloc_node()) always falls back to an order-0 allocation if the huge
page allocation fails.
This renders the warning useless and noisy, especially as all callers
appear to be aware that this may fall back. This has already resulted in
at least one bug report from a user who was confused by this (see link).
Therefore, simply update the code to only output this warning for order-0
pages when no fatal signal is pending.
Link: https://bugzilla.suse.com/show_bug.cgi?id=1211410
Link: https://lkml.kernel.org/r/20230605201107.83298-1-lstoakes@gmail.com
Fixes: 80b1d8fdfad1 ("mm: vmalloc: correct use of __GFP_NOWARN mask in __vmalloc_area_node()")
Signed-off-by: Lorenzo Stoakes <lstoakes(a)gmail.com>
Acked-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Baoquan He <bhe(a)redhat.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki(a)gmail.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Cc: Christoph Hellwig <hch(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmalloc.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
--- a/mm/vmalloc.c~mm-vmalloc-do-not-output-a-spurious-warning-when-huge-vmalloc-fails
+++ a/mm/vmalloc.c
@@ -3098,11 +3098,20 @@ static void *__vmalloc_area_node(struct
* allocation request, free them via vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
- /* vm_area_alloc_pages() can also fail due to a fatal signal */
- if (!fatal_signal_pending(current))
+ /*
+ * vm_area_alloc_pages() can fail due to insufficient memory but
+ * also:-
+ *
+ * - a pending fatal signal
+ * - insufficient huge page-order pages
+ *
+ * Since we always retry allocations at order-0 in the huge page
+ * case a warning for either is spurious.
+ */
+ if (!fatal_signal_pending(current) && page_order == 0)
warn_alloc(gfp_mask, NULL,
- "vmalloc error: size %lu, page order %u, failed to allocate pages",
- area->nr_pages * PAGE_SIZE, page_order);
+ "vmalloc error: size %lu, failed to allocate pages",
+ area->nr_pages * PAGE_SIZE);
goto fail;
}
_
Patches currently in -mm which might be from lstoakes(a)gmail.com are
The quilt patch titled
Subject: mm/mprotect: fix do_mprotect_pkey() limit check
has been removed from the -mm tree. Its filename was
mm-mprotect-fix-do_mprotect_pkey-limit-check.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Subject: mm/mprotect: fix do_mprotect_pkey() limit check
Date: Tue, 6 Jun 2023 14:29:12 -0400
do_mprotect_pkey() can still incorrectly return success if there is a gap
that spans to or beyond the end address passed in. Update the check to
ensure that the end address has indeed been seen.
Link: https://lore.kernel.org/all/CABi2SkXjN+5iFoBhxk71t3cmunTk-s=rB4T7qo0UQRh17s…
Link: https://lkml.kernel.org/r/20230606182912.586576-1-Liam.Howlett@oracle.com
Fixes: 82f951340f25 ("mm/mprotect: fix do_mprotect_pkey() return on error")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reported-by: Jeff Xu <jeffxu(a)chromium.org>
Reviewed-by: Lorenzo Stoakes <lstoakes(a)gmail.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mprotect.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/mprotect.c~mm-mprotect-fix-do_mprotect_pkey-limit-check
+++ a/mm/mprotect.c
@@ -867,7 +867,7 @@ static int do_mprotect_pkey(unsigned lon
}
tlb_finish_mmu(&tlb);
- if (!error && vma_iter_end(&vmi) < end)
+ if (!error && tmp < end)
error = -ENOMEM;
out:
_
Patches currently in -mm which might be from Liam.Howlett(a)oracle.com are
maple_tree-add-benchmarking-for-mas_for_each.patch
maple_tree-add-benchmarking-for-mas_prev.patch
mm-move-unmap_vmas-declaration-to-internal-header.patch
mm-change-do_vmi_align_munmap-side-tree-index.patch
mm-remove-prev-check-from-do_vmi_align_munmap.patch
maple_tree-introduce-__mas_set_range.patch
mm-remove-re-walk-from-mmap_region.patch
maple_tree-adjust-node-allocation-on-mas_rebalance.patch
maple_tree-re-introduce-entry-to-mas_preallocate-arguments.patch
mm-use-vma_iter_clear_gfp-in-nommu.patch
mm-set-up-vma-iterator-for-vma_iter_prealloc-calls.patch
maple_tree-move-mas_wr_end_piv-below-mas_wr_extend_null.patch
maple_tree-update-mas_preallocate-testing.patch
maple_tree-refine-mas_preallocate-node-calculations.patch
maple_tree-reduce-resets-during-store-setup.patch
mm-mmap-change-vma-iteration-order-in-do_vmi_align_munmap.patch
userfaultfd-fix-regression-in-userfaultfd_unmap_prep.patch
The quilt patch titled
Subject: writeback: fix dereferencing NULL mapping->host on writeback_page_template
has been removed from the -mm tree. Its filename was
writeback-fix-dereferencing-null-mapping-host-on-writeback_page_template.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Rafael Aquini <aquini(a)redhat.com>
Subject: writeback: fix dereferencing NULL mapping->host on writeback_page_template
Date: Tue, 6 Jun 2023 19:36:13 -0400
When commit 19343b5bdd16 ("mm/page-writeback: introduce tracepoint for
wait_on_page_writeback()") repurposed the writeback_dirty_page trace event
as a template to create its new wait_on_page_writeback trace event, it
ended up opening a window to NULL pointer dereference crashes due to the
(infrequent) occurrence of a race where an access to a page in the
swap-cache happens concurrently with the moment this page is being written
to disk and the tracepoint is enabled:
BUG: kernel NULL pointer dereference, address: 0000000000000040
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 800000010ec0a067 P4D 800000010ec0a067 PUD 102353067 PMD 0
Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 1 PID: 1320 Comm: shmem-worker Kdump: loaded Not tainted 6.4.0-rc5+ #13
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS edk2-20230301gitf80f052277c8-1.fc37 03/01/2023
RIP: 0010:trace_event_raw_event_writeback_folio_template+0x76/0xf0
Code: 4d 85 e4 74 5c 49 8b 3c 24 e8 06 98 ee ff 48 89 c7 e8 9e 8b ee ff ba 20 00 00 00 48 89 ef 48 89 c6 e8 fe d4 1a 00 49 8b 04 24 <48> 8b 40 40 48 89 43 28 49 8b 45 20 48 89 e7 48 89 43 30 e8 a2 4d
RSP: 0000:ffffaad580b6fb60 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff90e38035c01c RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff90e38035c044
RBP: ffff90e38035c024 R08: 0000000000000002 R09: 0000000000000006
R10: ffff90e38035c02e R11: 0000000000000020 R12: ffff90e380bac000
R13: ffffe3a7456d9200 R14: 0000000000001b81 R15: ffffe3a7456d9200
FS: 00007f2e4e8a15c0(0000) GS:ffff90e3fbc80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000040 CR3: 00000001150c6003 CR4: 0000000000170ee0
Call Trace:
<TASK>
? __die+0x20/0x70
? page_fault_oops+0x76/0x170
? kernelmode_fixup_or_oops+0x84/0x110
? exc_page_fault+0x65/0x150
? asm_exc_page_fault+0x22/0x30
? trace_event_raw_event_writeback_folio_template+0x76/0xf0
folio_wait_writeback+0x6b/0x80
shmem_swapin_folio+0x24a/0x500
? filemap_get_entry+0xe3/0x140
shmem_get_folio_gfp+0x36e/0x7c0
? find_busiest_group+0x43/0x1a0
shmem_fault+0x76/0x2a0
? __update_load_avg_cfs_rq+0x281/0x2f0
__do_fault+0x33/0x130
do_read_fault+0x118/0x160
do_pte_missing+0x1ed/0x2a0
__handle_mm_fault+0x566/0x630
handle_mm_fault+0x91/0x210
do_user_addr_fault+0x22c/0x740
exc_page_fault+0x65/0x150
asm_exc_page_fault+0x22/0x30
This problem arises from the fact that the repurposed writeback_dirty_page
trace event code was written assuming that every pointer to mapping
(struct address_space) would come from a file-mapped page-cache object,
thus mapping->host would always be populated, and that was a valid case
before commit 19343b5bdd16. The swap-cache address space
(swapper_spaces), however, doesn't populate its ->host (struct inode)
pointer, thus leading to the crashes in the aforementioned corner case.
commit 19343b5bdd16 ended up breaking the assignment of __entry->name and
__entry->ino for the wait_on_page_writeback tracepoint -- both dependent
on mapping->host carrying a pointer to a valid inode. The assignment of
__entry->name was fixed by commit 68f23b89067f ("memcg: fix a crash in
wb_workfn when a device disappears"), and this commit fixes the remaining
case, for __entry->ino.
Link: https://lkml.kernel.org/r/20230606233613.1290819-1-aquini@redhat.com
Fixes: 19343b5bdd16 ("mm/page-writeback: introduce tracepoint for wait_on_page_writeback()")
Signed-off-by: Rafael Aquini <aquini(a)redhat.com>
Reviewed-by: Yafang Shao <laoar.shao(a)gmail.com>
Cc: Aristeu Rozanski <aris(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/trace/events/writeback.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/include/trace/events/writeback.h~writeback-fix-dereferencing-null-mapping-host-on-writeback_page_template
+++ a/include/trace/events/writeback.h
@@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(writeback_folio_temp
strscpy_pad(__entry->name,
bdi_dev_name(mapping ? inode_to_bdi(mapping->host) :
NULL), 32);
- __entry->ino = mapping ? mapping->host->i_ino : 0;
+ __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0;
__entry->index = folio->index;
),
_
Patches currently in -mm which might be from aquini(a)redhat.com are
Culprit: https://lore.kernel.org/r/20211227180026.4068352-2-martin.blumenstingl@goog…
On Mon 27-12-2021 19:00:24, Martin Blumenstingl wrote:
> The dt-bindings for the UART controller only allow the following values
> for Meson6 SoCs:
> - "amlogic,meson6-uart", "amlogic,meson-ao-uart"
> - "amlogic,meson6-uart"
>
> Use the correct fallback compatible string "amlogic,meson-ao-uart" for
> AO UART. Drop the "amlogic,meson-uart" compatible string from the EE
> domain UART controllers.
KernelCI detected that this patch introduced a regression in
stable-rc/linux-4.14.y (4.14.267) on a meson8b-odroidc1.
After this patch was applied, the tests running on this platform no
longer show any serial output.
This doesn't happen in other stable branches or in mainline, but 4.14
hasn't reached EOL yet and it'd be good to find a fix.
Here's the bisection report:
https://groups.io/g/kernelci-results/message/40147
KernelCI info:
https://linux.kernelci.org/test/case/id/64234f7761021a30b262f776/
Test log:
https://storage.kernelci.org/stable-rc/linux-4.14.y/v4.14.311-43-g88e481d60…
Thanks,
Ricardo
In the post-init sequence of v2.9.0, write access to read-only registers
is not disabled after the registers are updated. Fix it by disabling the
access after the register update.
Cc: <stable(a)vger.kernel.org>
Fixes: 5d76117f070d ("PCI: qcom: Add support for IPQ8074 PCIe controller")
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam(a)linaro.org>
---
drivers/pci/controller/dwc/pcie-qcom.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index 4ab30892f6ef..ef385d36d653 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -836,6 +836,8 @@ static int qcom_pcie_post_init_2_3_3(struct qcom_pcie *pcie)
writel(PCI_EXP_DEVCTL2_COMP_TMOUT_DIS, pci->dbi_base + offset +
PCI_EXP_DEVCTL2);
+ dw_pcie_dbi_ro_wr_dis(pci);
+
return 0;
}
--
2.25.1
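For reference, a minimal sketch of the enable/update/disable pairing the
above fix completes (dw_pcie_dbi_ro_wr_en()/dw_pcie_dbi_ro_wr_dis() are
the DesignWare PCIe core helpers; the wrapper function and the register
updates themselves are placeholders):

static int post_init_sketch(struct dw_pcie *pci)
{
	dw_pcie_dbi_ro_wr_en(pci);	/* unlock read-only DBI registers */

	/* ... update the read-only registers here ... */

	dw_pcie_dbi_ro_wr_dis(pci);	/* re-lock them once done */
	return 0;
}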
I realized I was parsing the flags incorrectly.
First of all, it seems wrong to me to enable anything if
PTP_ENABLE_FEATURE is not set.
Secondly, if neither falling edge nor rising edge was specified, the
code just used whatever was already in the registers, which was wrong.
Let me know if the changes make sense.
Link I used as a reference for the flags:
https://elixir.bootlin.com/linux/latest/source/tools/testing/selftests/ptp/…
Fixes: 7a71c8aa0a75c ("phy: nxp-c45-tja11xx: add extts and perout support")
CC: stable(a)vger.kernel.org # 5.15+
Signed-off-by: Radu Pirea (NXP OSS) <radu-nicolae.pirea(a)oss.nxp.com>
---
drivers/net/phy/nxp-c45-tja11xx.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c
index f0d047019f33..ef4acb8eb0e4 100644
--- a/drivers/net/phy/nxp-c45-tja11xx.c
+++ b/drivers/net/phy/nxp-c45-tja11xx.c
@@ -595,7 +595,11 @@ static int nxp_c45_extts_enable(struct nxp_c45_phy *priv,
return 0;
}
- if (extts->flags & PTP_RISING_EDGE)
+ if (!(extts->flags & PTP_ENABLE_FEATURE))
+ return -EINVAL;
+
+ if ((extts->flags == PTP_ENABLE_FEATURE) ||
+ (extts->flags & PTP_RISING_EDGE))
phy_clear_bits_mmd(priv->phydev, MDIO_MMD_VEND1,
VEND1_PTP_CONFIG, EXT_TRG_EDGE);
--
2.34.1
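As a usage sketch from the userspace side (the device path and channel
index are assumptions; the ioctl and flag definitions come from the PTP
UAPI in <linux/ptp_clock.h>), this is how the fixed flag handling
behaves:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/ptp_clock.h>

int main(void)
{
	struct ptp_extts_request req;
	int fd = open("/dev/ptp0", O_RDWR);	/* assumed device node */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.index = 0;				/* assumed extts channel */
	/* With the fix: enable + rising edge is honoured; enable alone
	 * also selects rising edge; a request without PTP_ENABLE_FEATURE
	 * is now rejected with -EINVAL instead of being half-applied. */
	req.flags = PTP_ENABLE_FEATURE | PTP_RISING_EDGE;

	if (ioctl(fd, PTP_EXTTS_REQUEST, &req))
		perror("PTP_EXTTS_REQUEST");
	return 0;
}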
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
commit e18eb8783ec4949adebc7d7b0fdb65f65bfeefd9 upstream.
Currently, tracing_reset_all_online_cpus() requires trace_types_lock to
be held. But only one caller of this function actually holds that lock
before calling it; the other just takes the lock so that it can call it.
More users of this function are needed where the lock is not held.
Add a tracing_reset_all_online_cpus_unlocked() function for the one use
case that calls it with the lock already held, and also add a
lockdep_assert to make sure the lock is held when it is called.
Then have tracing_reset_all_online_cpus() take the lock internally, such
that callers do not need to worry about taking it.
Link: https://lkml.kernel.org/r/20221123192741.658273220@goodmis.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Zheng Yejian <zhengyejian1(a)huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
[be111ebd8868d4b7c041cb3c6102e1ae27d6dc1d depends on this patch, since
tracing_reset_all_online_cpus() should be called after taking the lock]
Fixes: be111ebd8868 ("tracing: Free buffers when a used dynamic event is removed")
Signed-off-by: Zheng Yejian <zhengyejian1(a)huawei.com>
---
kernel/trace/trace.c | 11 ++++++++++-
kernel/trace/trace.h | 1 +
kernel/trace/trace_events.c | 2 +-
kernel/trace/trace_events_synth.c | 2 --
4 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 482ec6606b7b..70526400e05c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2178,10 +2178,12 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
}
/* Must have trace_types_lock held */
-void tracing_reset_all_online_cpus(void)
+void tracing_reset_all_online_cpus_unlocked(void)
{
struct trace_array *tr;
+ lockdep_assert_held(&trace_types_lock);
+
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (!tr->clear_trace)
continue;
@@ -2193,6 +2195,13 @@ void tracing_reset_all_online_cpus(void)
}
}
+void tracing_reset_all_online_cpus(void)
+{
+ mutex_lock(&trace_types_lock);
+ tracing_reset_all_online_cpus_unlocked();
+ mutex_unlock(&trace_types_lock);
+}
+
/*
* The tgid_map array maps from pid to tgid; i.e. the value stored at index i
* is the tgid last observed corresponding to pid=i.
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 37f616bf5fa9..e5b505b5b7d0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -725,6 +725,7 @@ int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct array_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
+void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index bac13f24a96e..f8ed66f38175 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2661,7 +2661,7 @@ static void trace_module_remove_events(struct module *mod)
* over from this module may be passed to the new module events and
* unexpected results may occur.
*/
- tracing_reset_all_online_cpus();
+ tracing_reset_all_online_cpus_unlocked();
}
static int trace_module_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 18291ab35657..ee174de0b8f6 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1363,7 +1363,6 @@ int synth_event_delete(const char *event_name)
mutex_unlock(&event_mutex);
if (mod) {
- mutex_lock(&trace_types_lock);
/*
* It is safest to reset the ring buffer if the module
* being unloaded registered any events that were
@@ -1375,7 +1374,6 @@ int synth_event_delete(const char *event_name)
* occur.
*/
tracing_reset_all_online_cpus();
- mutex_unlock(&trace_types_lock);
}
return ret;
--
2.25.1
When a grant entry is still in use by the remote domain, Linux must put
it on a deferred list. Normally, this list is very short, because
the PV network and block protocols expect the backend to unmap the grant
first. However, Qubes OS's GUI protocol is subject to the constraints
of the X Window System, and as such winds up with the frontend unmapping
the window first. As a result, the list can grow very large, resulting
in a massive memory leak and eventual VM freeze.
To partially solve this problem, make the number of entries that the VM
will attempt to free at each iteration tunable. The default is still
10, but it can be overridden at compile-time (via Kconfig), boot-time
(via a kernel command-line option), or runtime (via sysfs).
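As a usage sketch (the value 100 is arbitrary), the boot-time and
runtime knobs look like this:

# on the kernel command line:
grant_table.free_per_iteration=100

# at runtime, via sysfs (the parameter is mode 0600, so root only):
echo 100 > /sys/module/grant_table/parameters/free_per_iteration
cat /sys/module/grant_table/parameters/free_per_iteration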
Fixes: 569ca5b3f94c ("xen/gnttab: add deferred freeing logic")
Cc: stable(a)vger.kernel.org
Signed-off-by: Demi Marie Obenour <demi(a)invisiblethingslab.com>
---
drivers/xen/Kconfig | 12 ++++++++++++
drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
2 files changed, 41 insertions(+), 11 deletions(-)
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index d5d7c402b65112b8592ba10bd3fd1732c26b771e..8f96e1359eb102d6420775b66e7805004a4ce9fe 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -65,6 +65,18 @@ config XEN_MEMORY_HOTPLUG_LIMIT
This value is used to allocate enough space in internal
tables needed for physical memory administration.
+config XEN_GRANTS_RECLAIM_PER_ITERATION
+ int "Default number of grant entries to reclaim per iteration"
+ default 10
+ range 10 4294967295
+ help
+ This sets the default value for the grant_table.free_per_iteration
+ kernel command line option, which sets the number of grants that
+ Linux will try to reclaim at once. The default is 10, but
+ workloads that make heavy use of gntalloc will likely want to
+ increase this. The current value can be accessed and/or modified
+ via /sys/module/grant_table/parameters/free_per_iteration.
+
config XEN_SCRUB_PAGES_DEFAULT
bool "Scrub pages before returning them to system by default"
depends on XEN_BALLOON
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index e1ec725c2819d4d5dede063eb00d86a6d52944c0..fa666aa6abc3e786dddc94f895641505ec0b23d8 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -498,14 +498,20 @@ static LIST_HEAD(deferred_list);
static void gnttab_handle_deferred(struct timer_list *);
static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
+static atomic64_t deferred_count;
+static atomic64_t leaked_count;
+static unsigned int free_per_iteration = CONFIG_XEN_GRANTS_RECLAIM_PER_ITERATION;
+
static void gnttab_handle_deferred(struct timer_list *unused)
{
- unsigned int nr = 10;
+ unsigned int nr = READ_ONCE(free_per_iteration);
+ const bool ignore_limit = nr == 0;
struct deferred_entry *first = NULL;
unsigned long flags;
+ size_t freed = 0;
spin_lock_irqsave(&gnttab_list_lock, flags);
- while (nr--) {
+ while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
struct deferred_entry *entry
= list_first_entry(&deferred_list,
struct deferred_entry, list);
@@ -515,10 +521,13 @@ static void gnttab_handle_deferred(struct timer_list *unused)
list_del(&entry->list);
spin_unlock_irqrestore(&gnttab_list_lock, flags);
if (_gnttab_end_foreign_access_ref(entry->ref)) {
+ uint64_t ret = atomic64_sub_return(1, &deferred_count);
put_free_entry(entry->ref);
- pr_debug("freeing g.e. %#x (pfn %#lx)\n",
- entry->ref, page_to_pfn(entry->page));
+ pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
+ entry->ref, page_to_pfn(entry->page),
+ (unsigned long long)ret);
put_page(entry->page);
+ freed++;
kfree(entry);
entry = NULL;
} else {
@@ -530,21 +539,22 @@ static void gnttab_handle_deferred(struct timer_list *unused)
spin_lock_irqsave(&gnttab_list_lock, flags);
if (entry)
list_add_tail(&entry->list, &deferred_list);
- else if (list_empty(&deferred_list))
- break;
}
- if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+ if (list_empty(&deferred_list))
+ WARN_ON(atomic64_read(&deferred_count));
+ else if (!timer_pending(&deferred_timer)) {
deferred_timer.expires = jiffies + HZ;
add_timer(&deferred_timer);
}
spin_unlock_irqrestore(&gnttab_list_lock, flags);
+ pr_debug("Freed %zu references", freed);
}
static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
{
struct deferred_entry *entry;
gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
- const char *what = KERN_WARNING "leaking";
+ uint64_t leaked, deferred;
entry = kmalloc(sizeof(*entry), gfp);
if (!page) {
@@ -567,12 +577,20 @@ static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
add_timer(&deferred_timer);
}
spin_unlock_irqrestore(&gnttab_list_lock, flags);
- what = KERN_DEBUG "deferring";
+ deferred = atomic64_add_return(1, &deferred_count);
+ leaked = atomic64_read(&leaked_count);
+ pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+ ref, page ? page_to_pfn(page) : -1, deferred, leaked);
+ } else {
+ deferred = atomic64_read(&deferred_count);
+ leaked = atomic64_add_return(1, &leaked_count);
+ pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+ ref, page ? page_to_pfn(page) : -1, deferred, leaked);
}
- printk("%s g.e. %#x (pfn %#lx)\n",
- what, ref, page ? page_to_pfn(page) : -1);
}
+module_param(free_per_iteration, uint, 0600);
+
int gnttab_try_end_foreign_access(grant_ref_t ref)
{
int ret = _gnttab_end_foreign_access_ref(ref);
--
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 606c812eb1d5b5fb0dd9e330ca94b52d7c227830
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061905-footless-freewill-5f13@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 606c812eb1d5b5fb0dd9e330ca94b52d7c227830 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Date: Sat, 17 Jun 2023 20:47:08 -0400
Subject: [PATCH] mm/mmap: Fix error path in do_vmi_align_munmap()
The error unrolling was leaving the VMAs detached in many cases and
leaving the locked_vm statistic altered, and skipping the unrolling
entirely in the case of the vma tree write failing.
Fix the error path by re-attaching the detached VMAs and adding the
necessary goto for the failed vma tree write, and fix the locked_vm
statistic by only updating after the vma tree write succeeds.
Fixes: 763ecb035029 ("mm: remove the vma linked list")
Reported-by: Vegard Nossum <vegard.nossum(a)oracle.com>
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/mmap.c b/mm/mmap.c
index 13678edaa22c..d600404580b2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2318,21 +2318,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
return __split_vma(vmi, vma, addr, new_below);
}
-static inline int munmap_sidetree(struct vm_area_struct *vma,
- struct ma_state *mas_detach)
-{
- vma_start_write(vma);
- mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
- if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
- return -ENOMEM;
-
- vma_mark_detached(vma, true);
- if (vma->vm_flags & VM_LOCKED)
- vma->vm_mm->locked_vm -= vma_pages(vma);
-
- return 0;
-}
-
/*
* do_vmi_align_munmap() - munmap the aligned region from @start to @end.
* @vmi: The vma iterator
@@ -2354,6 +2339,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct maple_tree mt_detach;
int count = 0;
int error = -ENOMEM;
+ unsigned long locked_vm = 0;
MA_STATE(mas_detach, &mt_detach, 0, 0);
mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
mt_set_external_lock(&mt_detach, &mm->mmap_lock);
@@ -2399,9 +2385,13 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (error)
goto end_split_failed;
}
- error = munmap_sidetree(next, &mas_detach);
- if (error)
- goto munmap_sidetree_failed;
+ vma_start_write(next);
+ mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1);
+ if (mas_store_gfp(&mas_detach, next, GFP_KERNEL))
+ goto munmap_gather_failed;
+ vma_mark_detached(next, true);
+ if (next->vm_flags & VM_LOCKED)
+ locked_vm += vma_pages(next);
count++;
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@@ -2447,10 +2437,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
#endif
/* Point of no return */
+ error = -ENOMEM;
vma_iter_set(vmi, start);
if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL))
- return -ENOMEM;
+ goto clear_tree_failed;
+ mm->locked_vm -= locked_vm;
mm->map_count -= count;
/*
* Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
@@ -2480,9 +2472,14 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
validate_mm(mm);
return downgrade ? 1 : 0;
+clear_tree_failed:
userfaultfd_error:
-munmap_sidetree_failed:
+munmap_gather_failed:
end_split_failed:
+ mas_set(&mas_detach, 0);
+ mas_for_each(&mas_detach, next, end)
+ vma_mark_detached(next, false);
+
__mt_destroy(&mt_detach);
start_split_failed:
map_count_exceeded:
The patch below does not apply to the 6.3-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.3.y
git checkout FETCH_HEAD
git cherry-pick -x 606c812eb1d5b5fb0dd9e330ca94b52d7c227830
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061957-faceplate-coeditor-ef11@gregkh' --subject-prefix 'PATCH 6.3.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 606c812eb1d5b5fb0dd9e330ca94b52d7c227830 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Date: Sat, 17 Jun 2023 20:47:08 -0400
Subject: [PATCH] mm/mmap: Fix error path in do_vmi_align_munmap()
The error unrolling was leaving the VMAs detached in many cases and
leaving the locked_vm statistic altered, and skipping the unrolling
entirely in the case of the vma tree write failing.
Fix the error path by re-attaching the detached VMAs and adding the
necessary goto for the failed vma tree write, and fix the locked_vm
statistic by only updating after the vma tree write succeeds.
Fixes: 763ecb035029 ("mm: remove the vma linked list")
Reported-by: Vegard Nossum <vegard.nossum(a)oracle.com>
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/mmap.c b/mm/mmap.c
index 13678edaa22c..d600404580b2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2318,21 +2318,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
return __split_vma(vmi, vma, addr, new_below);
}
-static inline int munmap_sidetree(struct vm_area_struct *vma,
- struct ma_state *mas_detach)
-{
- vma_start_write(vma);
- mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
- if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
- return -ENOMEM;
-
- vma_mark_detached(vma, true);
- if (vma->vm_flags & VM_LOCKED)
- vma->vm_mm->locked_vm -= vma_pages(vma);
-
- return 0;
-}
-
/*
* do_vmi_align_munmap() - munmap the aligned region from @start to @end.
* @vmi: The vma iterator
@@ -2354,6 +2339,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct maple_tree mt_detach;
int count = 0;
int error = -ENOMEM;
+ unsigned long locked_vm = 0;
MA_STATE(mas_detach, &mt_detach, 0, 0);
mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
mt_set_external_lock(&mt_detach, &mm->mmap_lock);
@@ -2399,9 +2385,13 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (error)
goto end_split_failed;
}
- error = munmap_sidetree(next, &mas_detach);
- if (error)
- goto munmap_sidetree_failed;
+ vma_start_write(next);
+ mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1);
+ if (mas_store_gfp(&mas_detach, next, GFP_KERNEL))
+ goto munmap_gather_failed;
+ vma_mark_detached(next, true);
+ if (next->vm_flags & VM_LOCKED)
+ locked_vm += vma_pages(next);
count++;
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@@ -2447,10 +2437,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
#endif
/* Point of no return */
+ error = -ENOMEM;
vma_iter_set(vmi, start);
if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL))
- return -ENOMEM;
+ goto clear_tree_failed;
+ mm->locked_vm -= locked_vm;
mm->map_count -= count;
/*
* Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
@@ -2480,9 +2472,14 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
validate_mm(mm);
return downgrade ? 1 : 0;
+clear_tree_failed:
userfaultfd_error:
-munmap_sidetree_failed:
+munmap_gather_failed:
end_split_failed:
+ mas_set(&mas_detach, 0);
+ mas_for_each(&mas_detach, next, end)
+ vma_mark_detached(next, false);
+
__mt_destroy(&mt_detach);
start_split_failed:
map_count_exceeded:
On 19/06/2023 03:52, gregkh(a)linuxfoundation.org wrote:
>
> This is a note to let you know that I've just added the patch titled
>
> net/sched: act_api: move TCA_EXT_WARN_MSG to the correct hierarchy
>
> to the 6.1-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> net-sched-act_api-move-tca_ext_warn_msg-to-the-correct-hierarchy.patch
> and it can be found in the queue-6.1 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
> From 923b2e30dc9cd05931da0f64e2e23d040865c035 Mon Sep 17 00:00:00 2001
> From: Pedro Tammela <pctammela(a)mojatatu.com>
> Date: Fri, 24 Feb 2023 14:56:01 -0300
> Subject: net/sched: act_api: move TCA_EXT_WARN_MSG to the correct hierarchy
>
> From: Pedro Tammela <pctammela(a)mojatatu.com>
>
> commit 923b2e30dc9cd05931da0f64e2e23d040865c035 upstream.
>
> TCA_EXT_WARN_MSG is currently sitting outside of the expected hierarchy
> for the tc actions code. It should sit within TCA_ACT_TAB.
>
> Fixes: 0349b8779cc9 ("sched: add new attr TCA_EXT_WARN_MSG to report tc extact message")
> Reviewed-by: Jamal Hadi Salim <jhs(a)mojatatu.com>
> Signed-off-by: Pedro Tammela <pctammela(a)mojatatu.com>
> Reviewed-by: Simon Horman <simon.horman(a)corigine.com>
> Signed-off-by: David S. Miller <davem(a)davemloft.net>
> Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
> ---
> net/sched/act_api.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -1603,12 +1603,12 @@ static int tca_get_fill(struct sk_buff *
> if (tcf_action_dump(skb, actions, bind, ref, false) < 0)
> goto out_nlmsg_trim;
>
> - nla_nest_end(skb, nest);
> -
> if (extack && extack->_msg &&
> nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
> goto out_nlmsg_trim;
>
> + nla_nest_end(skb, nest);
> +
> nlh->nlmsg_len = skb_tail_pointer(skb) - b;
>
> return skb->len;
>
Hi!
This commit is bogus. The correct one to pull is:
2f59823fe696 ("net/sched: act_api: add specific EXT_WARN_MSG for tc action")
If it's already in the queue then just removing this one is enough.
Thanks,
Pedro
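For readers following the nesting issue under discussion, here is a
minimal sketch of how netlink attribute nesting works (generic netlink
helpers; the error labels are simplified relative to the real code):
attributes added between nla_nest_start() and nla_nest_end() land inside
the nest, while anything added after nla_nest_end() becomes a top-level
sibling, which is the placement difference the quoted patch changes.

struct nlattr *nest = nla_nest_start(skb, TCA_ACT_TAB);
if (!nest)
	return -EMSGSIZE;
/* ... dump the actions into the nest ... */
if (msg && nla_put_string(skb, TCA_EXT_WARN_MSG, msg))
	return -EMSGSIZE;
nla_nest_end(skb, nest);	/* close TCA_ACT_TAB: the warn msg is now inside it */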
In jfs_dmap.c at line 381, BLKTODMAP is used to get a logical block
number inside dbFree(). db_l2nbperpage, which is the log2 number of
blocks per page, is passed as an argument to BLKTODMAP which uses it
for shifting.
Syzbot reported a shift out-of-bounds crash because db_l2nbperpage is
too big. This happens because the large value is set without any
validation in dbMount() at line 181.
Thus, make sure that db_l2nbperpage is correct while mounting.
Reported-and-tested-by: syzbot+d2cd27dcf8e04b232eb2(a)syzkaller.appspotmail.com
Link: https://syzkaller.appspot.com/bug?id=2a70a453331db32ed491f5cbb07e81bf2d2257…
Cc: stable(a)vger.kernel.org
Signed-off-by: Siddh Raman Pant <code(a)siddh.me>
---
fs/jfs/jfs_dmap.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index a3eb1e826947..62f058822a3a 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -178,7 +178,13 @@ int dbMount(struct inode *ipbmap)
dbmp_le = (struct dbmap_disk *) mp->data;
bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
+
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
+ if (bmp->db_l2nbperpage > L2MAXL0SIZE) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
if (!bmp->db_numag) {
err = -EINVAL;
--
2.39.2
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 97b6b9cbba40a21c1d9a344d5c1991f8cfbf136e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061759-gilled-droop-f51d@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 97b6b9cbba40a21c1d9a344d5c1991f8cfbf136e Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda(a)chromium.org>
Date: Fri, 19 May 2023 16:47:37 +0200
Subject: [PATCH] x86/purgatory: remove PGO flags
If profile-guided optimization is enabled, the purgatory ends up with
multiple .text sections. This is not supported by kexec and crashes the
system.
Link: https://lkml.kernel.org/r/20230321-kexec_clang16-v7-2-b05c520b7296@chromium…
Fixes: 930457057abe ("kernel/kexec_file.c: split up __kexec_load_puragory")
Signed-off-by: Ricardo Ribalda <ribalda(a)chromium.org>
Cc: <stable(a)vger.kernel.org>
Cc: Albert Ou <aou(a)eecs.berkeley.edu>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Eric W. Biederman <ebiederm(a)xmission.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Nathan Chancellor <nathan(a)kernel.org>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Nick Desaulniers <ndesaulniers(a)google.com>
Cc: Palmer Dabbelt <palmer(a)dabbelt.com>
Cc: Palmer Dabbelt <palmer(a)rivosinc.com>
Cc: Paul Walmsley <paul.walmsley(a)sifive.com>
Cc: Philipp Rudo <prudo(a)redhat.com>
Cc: Ross Zwisler <zwisler(a)google.com>
Cc: Simon Horman <horms(a)kernel.org>
Cc: Steven Rostedt (Google) <rostedt(a)goodmis.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Tom Rix <trix(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 82fec66d46d2..42abd6af1198 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -14,6 +14,11 @@ $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
CFLAGS_sha256.o := -D__DISABLE_EXPORTS
+# When profile-guided optimization is enabled, llvm emits two different
+# overlapping text sections, which is not supported by kexec. Remove profile
+# optimization flags.
+KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%,$(KBUILD_CFLAGS))
+
# When linking purgatory.ro with -r unresolved symbols are not checked,
# also link a purgatory.chk binary without -r to check for unresolved symbols.
PURGATORY_LDFLAGS := -e purgatory_start -z nodefaultlib
As noted by Michal, the blkg_iostat_set's in the lockless list hold
references to blkgs to protect against their removal. Those blkgs
hold a reference to the blkcg. When a cgroup is being destroyed,
cgroup_rstat_flush() is only called at css_release_work_fn() which
is called when the blkcg reference count reaches 0. This circular
dependency will prevent blkcg and some blkgs from being freed after
they are made offline.
It is less of a problem if the cgroup to be destroyed also has other
controllers like memory that will call cgroup_rstat_flush() which will
clean up the reference count. If block is the only controller that uses
rstat, these offline blkcg and blkgs may never be freed leaking more
and more memory over time.
To prevent this potential memory leak:
- flush blkcg per-cpu stats list in __blkg_release(), when no new stat
can be added
- add global blkg_stat_lock for covering concurrent parent blkg stat
update
- don't grab bio->bi_blkg reference when adding the stats into blkcg's
per-cpu stat list since all stats are guaranteed to be consumed before
releasing blkg instance, and grabbing blkg reference for stats was the
most fragile part of original patch
Based on Waiman's patch:
https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.…
Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
Cc: stable(a)vger.kernel.org
Reported-by: Jay Shin <jaeshin(a)redhat.com>
Acked-by: Tejun Heo <tj(a)kernel.org>
Cc: Waiman Long <longman(a)redhat.com>
Cc: mkoutny(a)suse.com
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Signed-off-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/20230609234249.1412858-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
Context difference with linus tree: 2c275afeb61d ("block: make blkcg_punt_bio_submit
optional") adds '#ifdef CONFIG_BLK_CGROUP_PUNT_BIO' in __blkg_release().
block/blk-cgroup.c | 40 +++++++++++++++++++++++++++++++---------
1 file changed, 31 insertions(+), 9 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 75bad5d60c9f..dd6d1c0117b1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -35,6 +35,8 @@
#include "blk-throttle.h"
#include "blk-rq-qos.h"
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
+
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
* blkcg_pol_register_mutex nests outside of it and synchronizes entire
@@ -58,6 +60,8 @@ static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
bool blkcg_debug_stats = false;
static struct workqueue_struct *blkcg_punt_bio_wq;
+static DEFINE_RAW_SPINLOCK(blkg_stat_lock);
+
#define BLKG_DESTROY_BATCH_SIZE 64
/*
@@ -165,8 +169,18 @@ static void blkg_free(struct blkcg_gq *blkg)
static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+ struct blkcg *blkcg = blkg->blkcg;
+ int cpu;
WARN_ON(!bio_list_empty(&blkg->async_bios));
+ /*
+ * Flush all the non-empty percpu lockless lists before releasing
+ * us, given these stat belongs to us.
+ *
+ * blkg_stat_lock is for serializing blkg stat update
+ */
+ for_each_possible_cpu(cpu)
+ __blkcg_rstat_flush(blkcg, cpu);
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
@@ -888,23 +902,26 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
{
- struct blkcg *blkcg = css_to_blkcg(css);
struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
struct llist_node *lnode;
struct blkg_iostat_set *bisc, *next_bisc;
- /* Root-level stats are sourced from system-wide IO stats */
- if (!cgroup_parent(css->cgroup))
- return;
-
rcu_read_lock();
lnode = llist_del_all(lhead);
if (!lnode)
goto out;
+ /*
+ * For covering concurrent parent blkg update from blkg_release().
+ *
+ * When flushing from cgroup, cgroup_rstat_lock is always held, so
+ * this lock won't cause contention most of time.
+ */
+ raw_spin_lock(&blkg_stat_lock);
+
/*
* Iterate only the iostat_cpu's queued in the lockless list.
*/
@@ -928,13 +945,19 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent && parent->parent)
blkcg_iostat_update(parent, &blkg->iostat.cur,
&blkg->iostat.last);
- percpu_ref_put(&blkg->refcnt);
}
-
+ raw_spin_unlock(&blkg_stat_lock);
out:
rcu_read_unlock();
}
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ /* Root-level stats are sourced from system-wide IO stats */
+ if (cgroup_parent(css->cgroup))
+ __blkcg_rstat_flush(css_to_blkcg(css), cpu);
+}
+
/*
* We source root cgroup stats from the system-wide stats to avoid
* tracking the same information twice and incurring overhead when no
@@ -2063,7 +2086,6 @@ void blk_cgroup_bio_start(struct bio *bio)
llist_add(&bis->lnode, lhead);
WRITE_ONCE(bis->lqueued, true);
- percpu_ref_get(&bis->blkg->refcnt);
}
u64_stats_update_end_irqrestore(&bis->sync, flags);
--
2.40.1
Hi,
Not sure why this was backported in the first place, but if so you'd
also need 1fb1abc83636 ("um: Fix build w/o CONFIG_PM_SLEEP").
I think b58294ce1a8a ("um: Allow PM with suspend-to-idle") should just
be reverted, but picking up the fix for it also works.
Robot keeps reporting to me that it's broken :)
Thanks,
johannes
The following regressions were found on stable-rc 5.4 while building MIPS configs.
Reported-by: Linux Kernel Functional Testing <lkft(a)linaro.org>
MIPS: Restore Au1300 support
[ Upstream commit f2041708dee30a3425f680265c337acd28293782 ]
Build log:
======
arch/mips/kernel/cpu-probe.c: In function 'cpu_probe':
arch/mips/kernel/cpu-probe.c:2125:9: error: duplicate case value
2125 | case PRID_COMP_NETLOGIC:
| ^~~~
arch/mips/kernel/cpu-probe.c:2099:9: note: previously used here
2099 | case PRID_COMP_NETLOGIC:
| ^~~~
make[3]: *** [scripts/Makefile.build:262: arch/mips/kernel/cpu-probe.o] Error 1
Links:
- https://qa-reports.linaro.org/lkft/linux-stable-rc-linux-5.4.y/build/v5.4.2…
- https://qa-reports.linaro.org/lkft/linux-stable-rc-linux-5.4.y/build/v5.4.2…
--
Linaro LKFT
https://lkft.linaro.org
Struct bd6107_platform_data refers to a platform device within
the Linux device hierarchy. The test in bd6107_backlight_check_fb()
compares it against the fbdev device in struct fb_info.dev, which
is different. Fix the test by comparing to struct fb_info.device.
Fixes a bug in the backlight driver and prepares fbdev for making
struct fb_info.dev optional.
v2:
* move renames into separate patch (Javier, Sam, Michael)
Fixes: 67b43e590415 ("backlight: Add ROHM BD6107 backlight driver")
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Cc: Laurent Pinchart <laurent.pinchart+renesas(a)ideasonboard.com>
Cc: Lee Jones <lee(a)kernel.org>
Cc: Daniel Thompson <daniel.thompson(a)linaro.org>
Cc: Jingoo Han <jingoohan1(a)gmail.com>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v3.12+
Reviewed-by: Javier Martinez Canillas <javierm(a)redhat.com>
Reviewed-by: Sam Ravnborg <sam(a)ravnborg.org>
Reviewed-by: Daniel Thompson <daniel.thompson(a)linaro.org>
---
drivers/video/backlight/bd6107.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/video/backlight/bd6107.c b/drivers/video/backlight/bd6107.c
index f4db6c064635b..e3410444ea235 100644
--- a/drivers/video/backlight/bd6107.c
+++ b/drivers/video/backlight/bd6107.c
@@ -104,7 +104,7 @@ static int bd6107_backlight_check_fb(struct backlight_device *backlight,
{
struct bd6107 *bd = bl_get_data(backlight);
- return bd->pdata->fbdev == NULL || bd->pdata->fbdev == info->dev;
+ return bd->pdata->fbdev == NULL || bd->pdata->fbdev == info->device;
}
static const struct backlight_ops bd6107_backlight_ops = {
--
2.41.0
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 1240eb93f0616b21c675416516ff3d74798fdc97
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061933-remover-tweet-3f9b@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1240eb93f0616b21c675416516ff3d74798fdc97 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo(a)netfilter.org>
Date: Thu, 8 Jun 2023 02:32:02 +0200
Subject: [PATCH] netfilter: nf_tables: incorrect error path handling with
NFT_MSG_NEWRULE
In case of error when adding a new rule that refers to an anonymous set,
deactivate expressions via NFT_TRANS_PREPARE state, not NFT_TRANS_RELEASE.
Thus, the lookup expression marks anonymous sets as inactive in the next
generation to ensure it is not reachable in this transaction anymore and
decrement the set refcount as introduced by c1592a89942e ("netfilter:
nf_tables: deactivate anonymous set from preparation phase"). The abort
step takes care of undoing the anonymous set.
This is also consistent with rule deletion, where NFT_TRANS_PREPARE is
used. Note that this error path is exercised in the preparation step of
the commit protocol. This patch replaces nf_tables_rule_release() by the
deactivate and destroy calls, this time with NFT_TRANS_PREPARE.
Due to this incorrect error handling, it is possible to access a
dangling pointer to the anonymous set that remains in the transaction
list.
[1009.379054] BUG: KASAN: use-after-free in nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379106] Read of size 8 at addr ffff88816c4c8020 by task nft-rule-add/137110
[1009.379116] CPU: 7 PID: 137110 Comm: nft-rule-add Not tainted 6.4.0-rc4+ #256
[1009.379128] Call Trace:
[1009.379132] <TASK>
[1009.379135] dump_stack_lvl+0x33/0x50
[1009.379146] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379191] print_address_description.constprop.0+0x27/0x300
[1009.379201] kasan_report+0x107/0x120
[1009.379210] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379255] nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379302] nft_lookup_init+0xa5/0x270 [nf_tables]
[1009.379350] nf_tables_newrule+0x698/0xe50 [nf_tables]
[1009.379397] ? nf_tables_rule_release+0xe0/0xe0 [nf_tables]
[1009.379441] ? kasan_unpoison+0x23/0x50
[1009.379450] nfnetlink_rcv_batch+0x97c/0xd90 [nfnetlink]
[1009.379470] ? nfnetlink_rcv_msg+0x480/0x480 [nfnetlink]
[1009.379485] ? __alloc_skb+0xb8/0x1e0
[1009.379493] ? __alloc_skb+0xb8/0x1e0
[1009.379502] ? entry_SYSCALL_64_after_hwframe+0x46/0xb0
[1009.379509] ? unwind_get_return_address+0x2a/0x40
[1009.379517] ? write_profile+0xc0/0xc0
[1009.379524] ? avc_lookup+0x8f/0xc0
[1009.379532] ? __rcu_read_unlock+0x43/0x60
Fixes: 958bee14d071 ("netfilter: nf_tables: use new transaction infrastructure to handle sets")
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3bb0800b3849..69bceefaa5c8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3844,7 +3844,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
if (flow)
nft_flow_rule_destroy(flow);
err_release_rule:
- nf_tables_rule_release(&ctx, rule);
+ nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE);
+ nf_tables_rule_destroy(&ctx, rule);
err_release_expr:
for (i = 0; i < n; i++) {
if (expr_info[i].ops) {
From: Ma Wupeng <mawupeng1(a)huawei.com>
Hi maintainers:
Our tests found a memory leak in init_memory_block(): the memory block
is never released due to a wrong refcount. Commit 08b3acd7a68f
("mm/memory_hotplug: Introduce offline_and_remove_memory()") failed to
drop the refcount taken by find_memory_block(), so the refcount never
reaches zero on memory removal, causing the leak.
Commit 8dc4bb58a146 ("mm/memory_hotplug: extend offline_and_remove_memory()
to handle more than one memory block") introduced walk_memory_blocks()
to replace find_memory_block(); it drops the refcount by calling
put_device() after find_memory_block_by_id(). In this way, the memory
leak is fixed.
Here is the simplified calltrace:
kmem_cache_alloc_trace+0x664/0xed0
init_memory_block+0x8c/0x170
create_memory_block_devices+0xa4/0x150
add_memory_resource+0x188/0x530
__add_memory+0x78/0x104
add_memory+0x6c/0xb0
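A minimal sketch of the refcount rule involved (names taken from the
changelog above; the surrounding hotplug logic is elided): the lookup
takes a device reference that must be paired with a put.

struct memory_block *mem = find_memory_block_by_id(block_id);
if (mem) {
	/* ... operate on the block ... */
	put_device(&mem->dev);	/* without this put, the block leaks */
}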
David Hildenbrand (1):
mm/memory_hotplug: extend offline_and_remove_memory() to handle more
than one memory block
mm/memory_hotplug.c | 105 +++++++++++++++++++++++++++++++++++++-------
1 file changed, 89 insertions(+), 16 deletions(-)
--
2.25.1
Hi all,
This series backports commit d5c8d6e0fa61 ("kbuild: Update assembler
calls to use proper flags and language target") to linux-6.1.y to
address a recent issue caused by a change in behavior in clang:
https://lore.kernel.org/CA+G9fYsJq0sPC+q6vLNKUgBqCGmmjDrfeP4R1-95Eu28FJRY_A…
https://lore.kernel.org/20230612185424.GA2891387@dev-arch.thelio-3990X/
While that was not the original intention of the aforementioned change,
it ends up resolving the issue for the same reason, by not passing flags
that are not supported or necessary for the current language target
(KBUILD_CFLAGS for .c files and KBUILD_AFLAGS for .S files) when testing
flags for that language target.
All patches except the second one are direct backports from mainline.
The second patch is a stable specific patch because the upstream
solution could break stable due to the minimum supported version of
binutils in mainline being a newer version than 6.1 and earlier; it
chooses to do the more conservative fix, which was alluded to in the
changelog of the upstream commit.
For now, this is just a 6.1 issue. If the issue occurs in older
releases, I will send separate backports. If there are any issues or
objections to this series, please let me know.
Cheers,
Nathan
---
Nathan Chancellor (2):
MIPS: Move '-Wa,-msoft-float' check from as-option to cc-option
MIPS: Prefer cc-option for additions to cflags
Nick Desaulniers (2):
x86/boot/compressed: prefer cc-option for CFLAGS additions
kbuild: Update assembler calls to use proper flags and language target
arch/mips/Makefile | 4 ++--
arch/mips/loongson2ef/Platform | 2 +-
arch/x86/boot/compressed/Makefile | 2 +-
scripts/Kconfig.include | 2 +-
scripts/Makefile.compiler | 8 ++++----
scripts/as-version.sh | 2 +-
6 files changed, 10 insertions(+), 10 deletions(-)
---
base-commit: ca87e77a2ef8b298aa9f69658d5898e72ee450fe
change-id: 20230612-6-1-asssembler-target-llvm-17-3f8101fc008f
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
commit 92c5d1b860e9581d64baca76779576c0ab0d943d upstream.
The current sanity check for nilfs2 geometry information lacks checks for
the number of segments stored in superblocks, so even for device images
that have been destructively truncated or have an unusually high number of
segments, the mount operation may succeed.
This causes out-of-bounds block I/O on file system block reads or log
writes to the segments, the latter in particular causing
"a_ops->writepages" to repeatedly fail, resulting in sync_inodes_sb() to
hang.
Fix this issue by checking the number of segments stored in the superblock
and avoiding mounting devices that can cause out-of-bounds accesses. To
eliminate the possibility of overflow when calculating the number of
blocks required for the device from the number of segments, this also adds
a helper function to calculate the upper bound on the number of segments
and inserts a check using it.
Link: https://lkml.kernel.org/r/20230526021332.3431-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+7d50f1e54a12ba3aeae2(a)syzkaller.appspotmail.com
Link: https://syzkaller.appspot.com/bug?extid=7d50f1e54a12ba3aeae2
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
Please apply this patch to the above stable trees instead of the patch
that could not be applied to them. The hang issue reported by syzbot was
confirmed to reproduce on these stable kernels using its reproducer.
This fixes it.
In this patch, "sb_bdev_nr_blocks()" is replaced with its equivalent since
it doesn't yet exist in these kernels. With this tweak, this patch is
applicable from v5.9 to v5.15. Also, this patch has been tested against
the stable trees named in the subject line.
fs/nilfs2/the_nilfs.c | 44 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 43 insertions(+), 1 deletion(-)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0fa130362816..fe2e7197268b 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -405,6 +405,18 @@ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs)
100));
}
+/**
+ * nilfs_max_segment_count - calculate the maximum number of segments
+ * @nilfs: nilfs object
+ */
+static u64 nilfs_max_segment_count(struct the_nilfs *nilfs)
+{
+ u64 max_count = U64_MAX;
+
+ do_div(max_count, nilfs->ns_blocks_per_segment);
+ return min_t(u64, max_count, ULONG_MAX);
+}
+
void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
{
nilfs->ns_nsegments = nsegs;
@@ -414,6 +426,8 @@ void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
struct nilfs_super_block *sbp)
{
+ u64 nsegments, nblocks;
+
if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
nilfs_err(nilfs->ns_sb,
"unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).",
@@ -457,7 +471,35 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
return -EINVAL;
}
- nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments));
+ nsegments = le64_to_cpu(sbp->s_nsegments);
+ if (nsegments > nilfs_max_segment_count(nilfs)) {
+ nilfs_err(nilfs->ns_sb,
+ "segment count %llu exceeds upper limit (%llu segments)",
+ (unsigned long long)nsegments,
+ (unsigned long long)nilfs_max_segment_count(nilfs));
+ return -EINVAL;
+ }
+
+ nblocks = (u64)i_size_read(nilfs->ns_sb->s_bdev->bd_inode) >>
+ nilfs->ns_sb->s_blocksize_bits;
+ if (nblocks) {
+ u64 min_block_count = nsegments * nilfs->ns_blocks_per_segment;
+ /*
+ * To avoid failing to mount early device images without a
+ * second superblock, exclude that block count from the
+ * "min_block_count" calculation.
+ */
+
+ if (nblocks < min_block_count) {
+ nilfs_err(nilfs->ns_sb,
+ "total number of segment blocks %llu exceeds device size (%llu blocks)",
+ (unsigned long long)min_block_count,
+ (unsigned long long)nblocks);
+ return -EINVAL;
+ }
+ }
+
+ nilfs_set_nsegments(nilfs, nsegments);
nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
return 0;
}
--
2.39.3
commit 003fb0a51162d940f25fc35e70b0996a12c9e08a upstream.
Requests to the mmc layer usually come through a block device IO.
The exceptions are the ioctl interface, RPMB chardev ioctl
and debugfs, which issue their own blk_mq requests through
blk_execute_rq and do not query the BLK_STS error but the
mmcblk-internal drv_op_result. This patch ensures that drv_op_result
defaults to an error and has to be overwritten by the operation
to be considered successful.
The behavior leads to a bug where the request never propagates
the error, e.g. by directly erroring out at mmc_blk_mq_issue_rq if
mmc_blk_part_switch fails. The ioctl caller of the rpmb chardev then
can never see an error (BLK_STS_IOERR, but drv_op_result is unchanged)
and thus may assume that their call executed successfully when it did not.
While always checking the blk_execute_rq return value would be
advised, let's eliminate the error by always initializing
drv_op_result to -EIO, to be overwritten on success (or another error).
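A condensed sketch of why the pessimistic default matters (lines drawn
from the pattern in the patch below; allocation and error handling
elided): if the request errors out before the driver op runs,
drv_op_result keeps whatever it held, so callers that only check
drv_op_result would see stale success.

req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_IOCTL;
req_to_mmc_queue_req(req)->drv_op_result = -EIO;   /* pessimistic default */
blk_execute_rq(NULL, req, 0);                      /* BLK_STS error not checked */
ret = req_to_mmc_queue_req(req)->drv_op_result;    /* only the op overwrites it */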
Fixes: 614f0388f580 ("mmc: block: move single ioctl() commands to block requests")
Signed-off-by: Christian Loehle <cloehle(a)hyperstone.com>
---
This is for the 5.15. stable tree
drivers/mmc/core/block.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index ed034b93cb25..0b72096f10e6 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -265,6 +265,7 @@ static ssize_t power_ro_lock_store(struct device *dev,
goto out_put;
}
req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP;
+ req_to_mmc_queue_req(req)->drv_op_result = -EIO;
blk_execute_rq(NULL, req, 0);
ret = req_to_mmc_queue_req(req)->drv_op_result;
blk_put_request(req);
@@ -656,6 +657,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md,
idatas[0] = idata;
req_to_mmc_queue_req(req)->drv_op =
rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL;
+ req_to_mmc_queue_req(req)->drv_op_result = -EIO;
req_to_mmc_queue_req(req)->drv_op_data = idatas;
req_to_mmc_queue_req(req)->ioc_count = 1;
blk_execute_rq(NULL, req, 0);
@@ -725,6 +727,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md,
}
req_to_mmc_queue_req(req)->drv_op =
rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL;
+ req_to_mmc_queue_req(req)->drv_op_result = -EIO;
req_to_mmc_queue_req(req)->drv_op_data = idata;
req_to_mmc_queue_req(req)->ioc_count = num_of_cmds;
blk_execute_rq(NULL, req, 0);
@@ -2784,6 +2787,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val)
if (IS_ERR(req))
return PTR_ERR(req);
req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS;
+ req_to_mmc_queue_req(req)->drv_op_result = -EIO;
blk_execute_rq(NULL, req, 0);
ret = req_to_mmc_queue_req(req)->drv_op_result;
if (ret >= 0) {
@@ -2822,6 +2826,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp)
goto out_free;
}
req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_EXT_CSD;
+ req_to_mmc_queue_req(req)->drv_op_result = -EIO;
req_to_mmc_queue_req(req)->drv_op_data = &ext_csd;
blk_execute_rq(NULL, req, 0);
err = req_to_mmc_queue_req(req)->drv_op_result;
--
2.37.3
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 1240eb93f0616b21c675416516ff3d74798fdc97
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061939-sprout-jujitsu-b6a0@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1240eb93f0616b21c675416516ff3d74798fdc97 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo(a)netfilter.org>
Date: Thu, 8 Jun 2023 02:32:02 +0200
Subject: [PATCH] netfilter: nf_tables: incorrect error path handling with
NFT_MSG_NEWRULE
In case of error when adding a new rule that refers to an anonymous set,
deactivate expressions via NFT_TRANS_PREPARE state, not NFT_TRANS_RELEASE.
Thus, the lookup expression marks anonymous sets as inactive in the next
generation to ensure it is not reachable in this transaction anymore and
decrement the set refcount as introduced by c1592a89942e ("netfilter:
nf_tables: deactivate anonymous set from preparation phase"). The abort
step takes care of undoing the anonymous set.
This is also consistent with rule deletion, where NFT_TRANS_PREPARE is
used. Note that this error path is exercised in the preparation step of
the commit protocol. This patch replaces nf_tables_rule_release() by the
deactivate and destroy calls, this time with NFT_TRANS_PREPARE.
Due to this incorrect error handling, it is possible to access a
dangling pointer to the anonymous set that remains in the transaction
list.
[1009.379054] BUG: KASAN: use-after-free in nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379106] Read of size 8 at addr ffff88816c4c8020 by task nft-rule-add/137110
[1009.379116] CPU: 7 PID: 137110 Comm: nft-rule-add Not tainted 6.4.0-rc4+ #256
[1009.379128] Call Trace:
[1009.379132] <TASK>
[1009.379135] dump_stack_lvl+0x33/0x50
[1009.379146] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379191] print_address_description.constprop.0+0x27/0x300
[1009.379201] kasan_report+0x107/0x120
[1009.379210] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379255] nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379302] nft_lookup_init+0xa5/0x270 [nf_tables]
[1009.379350] nf_tables_newrule+0x698/0xe50 [nf_tables]
[1009.379397] ? nf_tables_rule_release+0xe0/0xe0 [nf_tables]
[1009.379441] ? kasan_unpoison+0x23/0x50
[1009.379450] nfnetlink_rcv_batch+0x97c/0xd90 [nfnetlink]
[1009.379470] ? nfnetlink_rcv_msg+0x480/0x480 [nfnetlink]
[1009.379485] ? __alloc_skb+0xb8/0x1e0
[1009.379493] ? __alloc_skb+0xb8/0x1e0
[1009.379502] ? entry_SYSCALL_64_after_hwframe+0x46/0xb0
[1009.379509] ? unwind_get_return_address+0x2a/0x40
[1009.379517] ? write_profile+0xc0/0xc0
[1009.379524] ? avc_lookup+0x8f/0xc0
[1009.379532] ? __rcu_read_unlock+0x43/0x60
Fixes: 958bee14d071 ("netfilter: nf_tables: use new transaction infrastructure to handle sets")
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3bb0800b3849..69bceefaa5c8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3844,7 +3844,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
if (flow)
nft_flow_rule_destroy(flow);
err_release_rule:
- nf_tables_rule_release(&ctx, rule);
+ nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE);
+ nf_tables_rule_destroy(&ctx, rule);
err_release_expr:
for (i = 0; i < n; i++) {
if (expr_info[i].ops) {
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 1240eb93f0616b21c675416516ff3d74798fdc97
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061937-spiritism-reliably-6082@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1240eb93f0616b21c675416516ff3d74798fdc97 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo(a)netfilter.org>
Date: Thu, 8 Jun 2023 02:32:02 +0200
Subject: [PATCH] netfilter: nf_tables: incorrect error path handling with
NFT_MSG_NEWRULE
In case of error when adding a new rule that refers to an anonymous set,
deactivate expressions via NFT_TRANS_PREPARE state, not NFT_TRANS_RELEASE.
Thus, the lookup expression marks anonymous sets as inactive in the next
generation to ensure it is not reachable in this transaction anymore and
decrement the set refcount as introduced by c1592a89942e ("netfilter:
nf_tables: deactivate anonymous set from preparation phase"). The abort
step takes care of undoing the anonymous set.
This is also consistent with rule deletion, where NFT_TRANS_PREPARE is
used. Note that this error path is exercised in the preparation step of
the commit protocol. This patch replaces nf_tables_rule_release() by the
deactivate and destroy calls, this time with NFT_TRANS_PREPARE.
Due to this incorrect error handling, it is possible to access a
dangling pointer to the anonymous set that remains in the transaction
list.
[1009.379054] BUG: KASAN: use-after-free in nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379106] Read of size 8 at addr ffff88816c4c8020 by task nft-rule-add/137110
[1009.379116] CPU: 7 PID: 137110 Comm: nft-rule-add Not tainted 6.4.0-rc4+ #256
[1009.379128] Call Trace:
[1009.379132] <TASK>
[1009.379135] dump_stack_lvl+0x33/0x50
[1009.379146] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379191] print_address_description.constprop.0+0x27/0x300
[1009.379201] kasan_report+0x107/0x120
[1009.379210] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379255] nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379302] nft_lookup_init+0xa5/0x270 [nf_tables]
[1009.379350] nf_tables_newrule+0x698/0xe50 [nf_tables]
[1009.379397] ? nf_tables_rule_release+0xe0/0xe0 [nf_tables]
[1009.379441] ? kasan_unpoison+0x23/0x50
[1009.379450] nfnetlink_rcv_batch+0x97c/0xd90 [nfnetlink]
[1009.379470] ? nfnetlink_rcv_msg+0x480/0x480 [nfnetlink]
[1009.379485] ? __alloc_skb+0xb8/0x1e0
[1009.379493] ? __alloc_skb+0xb8/0x1e0
[1009.379502] ? entry_SYSCALL_64_after_hwframe+0x46/0xb0
[1009.379509] ? unwind_get_return_address+0x2a/0x40
[1009.379517] ? write_profile+0xc0/0xc0
[1009.379524] ? avc_lookup+0x8f/0xc0
[1009.379532] ? __rcu_read_unlock+0x43/0x60
Fixes: 958bee14d071 ("netfilter: nf_tables: use new transaction infrastructure to handle sets")
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3bb0800b3849..69bceefaa5c8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3844,7 +3844,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
if (flow)
nft_flow_rule_destroy(flow);
err_release_rule:
- nf_tables_rule_release(&ctx, rule);
+ nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE);
+ nf_tables_rule_destroy(&ctx, rule);
err_release_expr:
for (i = 0; i < n; i++) {
if (expr_info[i].ops) {
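For reference, the helper being replaced bundles the same two calls but with
the wrong transaction state. A sketch of what nf_tables_rule_release() is
assumed to look like, inferred from the commit message rather than taken from
the quoted diff:

    /* Assumed shape of the replaced helper: it deactivates with
     * NFT_TRANS_RELEASE, which does not mark anonymous sets inactive in
     * the next generation, leaving them reachable on the transaction list. */
    static void nf_tables_rule_release(const struct nft_ctx *ctx,
                                       struct nft_rule *rule)
    {
            nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE);
            nf_tables_rule_destroy(ctx, rule);
    }

Open-coding the pair with NFT_TRANS_PREPARE, as the diff does, matches the
rule-deletion path and lets the abort step undo the anonymous set.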
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 1240eb93f0616b21c675416516ff3d74798fdc97
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061935-renewed-granite-7529@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1240eb93f0616b21c675416516ff3d74798fdc97 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo(a)netfilter.org>
Date: Thu, 8 Jun 2023 02:32:02 +0200
Subject: [PATCH] netfilter: nf_tables: incorrect error path handling with
NFT_MSG_NEWRULE
In case of error when adding a new rule that refers to an anonymous set,
deactivate expressions via the NFT_TRANS_PREPARE state, not NFT_TRANS_RELEASE.
Thus, the lookup expression marks anonymous sets as inactive in the next
generation to ensure they are no longer reachable in this transaction, and
decrements the set refcount as introduced by c1592a89942e ("netfilter:
nf_tables: deactivate anonymous set from preparation phase"). The abort
step takes care of undoing the anonymous set.
This is also consistent with rule deletion, where NFT_TRANS_PREPARE is
used. Note that this error path is exercised in the preparation step of
the commit protocol. This patch replaces nf_tables_rule_release() with the
deactivate and destroy calls, this time with NFT_TRANS_PREPARE.
Due to this incorrect error handling, it is possible to access a
dangling pointer to the anonymous set that remains in the transaction
list.
[1009.379054] BUG: KASAN: use-after-free in nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379106] Read of size 8 at addr ffff88816c4c8020 by task nft-rule-add/137110
[1009.379116] CPU: 7 PID: 137110 Comm: nft-rule-add Not tainted 6.4.0-rc4+ #256
[1009.379128] Call Trace:
[1009.379132] <TASK>
[1009.379135] dump_stack_lvl+0x33/0x50
[1009.379146] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379191] print_address_description.constprop.0+0x27/0x300
[1009.379201] kasan_report+0x107/0x120
[1009.379210] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379255] nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379302] nft_lookup_init+0xa5/0x270 [nf_tables]
[1009.379350] nf_tables_newrule+0x698/0xe50 [nf_tables]
[1009.379397] ? nf_tables_rule_release+0xe0/0xe0 [nf_tables]
[1009.379441] ? kasan_unpoison+0x23/0x50
[1009.379450] nfnetlink_rcv_batch+0x97c/0xd90 [nfnetlink]
[1009.379470] ? nfnetlink_rcv_msg+0x480/0x480 [nfnetlink]
[1009.379485] ? __alloc_skb+0xb8/0x1e0
[1009.379493] ? __alloc_skb+0xb8/0x1e0
[1009.379502] ? entry_SYSCALL_64_after_hwframe+0x46/0xb0
[1009.379509] ? unwind_get_return_address+0x2a/0x40
[1009.379517] ? write_profile+0xc0/0xc0
[1009.379524] ? avc_lookup+0x8f/0xc0
[1009.379532] ? __rcu_read_unlock+0x43/0x60
Fixes: 958bee14d071 ("netfilter: nf_tables: use new transaction infrastructure to handle sets")
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3bb0800b3849..69bceefaa5c8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3844,7 +3844,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
if (flow)
nft_flow_rule_destroy(flow);
err_release_rule:
- nf_tables_rule_release(&ctx, rule);
+ nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE);
+ nf_tables_rule_destroy(&ctx, rule);
err_release_expr:
for (i = 0; i < n; i++) {
if (expr_info[i].ops) {
From: Michael Ellerman <mpe(a)ellerman.id.au>
Our logic for choosing defconfig doesn't work well in some situations.
For example, if you're on a ppc64le machine but you specify a non-empty
CROSS_COMPILE, in order to use a non-default toolchain, then defconfig
will give you ppc64_defconfig (big endian):
$ make CROSS_COMPILE=~/toolchains/gcc-8/bin/powerpc-linux- defconfig
*** Default configuration is based on 'ppc64_defconfig'
This is because we assume that CROSS_COMPILE being set means we
can't be on a ppc machine, and rather than checking, we just default to
ppc64_defconfig.
We should just ignore CROSS_COMPILE and instead check the machine with
uname: if it's one of ppc, ppc64 or ppc64le, then use that defconfig.
If it's none of those, we fall back to ppc64_defconfig.
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
(cherry picked from commit af5cd05de5dd38cf25d14ea4d30ae9b791d2420b)
Signed-off-by: Alyssa Ross <hi(a)alyssa.is>
---
arch/powerpc/Makefile | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 9c78ef298257..cbc7c05a6165 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -29,11 +29,10 @@ endif
export CROSS32CC CROSS32AR
-ifeq ($(CROSS_COMPILE),)
-KBUILD_DEFCONFIG := $(shell uname -m)_defconfig
-else
-KBUILD_DEFCONFIG := ppc64_defconfig
-endif
+# If we're on a ppc/ppc64/ppc64le machine use that defconfig, otherwise just use
+# ppc64_defconfig because we have nothing better to go on.
+uname := $(shell uname -m)
+KBUILD_DEFCONFIG := $(if $(filter ppc%,$(uname)),$(uname),ppc64)_defconfig
ifeq ($(CONFIG_PPC64),y)
new_nm := $(shell if $(NM) --help 2>&1 | grep -- '--synthetic' > /dev/null; then echo y; else echo n; fi)
base-commit: 1914956342c8cf52a377aecc4944e63f9229cb9b
--
2.37.1
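To make the new selection rule concrete, here is a small stand-alone model of
what $(if $(filter ppc%,$(uname)),$(uname),ppc64)_defconfig evaluates to,
written in C purely for illustration (pick_defconfig() is a made-up name,
not kernel code):

    #include <stdio.h>
    #include <string.h>

    /* Mirror of the Makefile logic: machines whose `uname -m` starts with
     * "ppc" get their own defconfig; everything else falls back to ppc64. */
    static void pick_defconfig(const char *machine, char *out, size_t len)
    {
            if (strncmp(machine, "ppc", 3) == 0)
                    snprintf(out, len, "%s_defconfig", machine);
            else
                    snprintf(out, len, "ppc64_defconfig");
    }

    int main(void)
    {
            const char *machines[] = { "ppc64le", "ppc64", "ppc", "x86_64" };
            char buf[64];

            for (size_t i = 0; i < sizeof(machines) / sizeof(machines[0]); i++) {
                    pick_defconfig(machines[i], buf, sizeof(buf));
                    printf("%-8s -> %s\n", machines[i], buf);
            }
            return 0;
    }

The point of the change shows in the last case: a foreign build machine still
falls back to ppc64_defconfig, while any ppc* machine keeps its native
defconfig regardless of CROSS_COMPILE.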
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 84ad0af0bccd3691cb951c2974c5cb2c10594d4a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061948-emerald-clamor-35c8@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 84ad0af0bccd3691cb951c2974c5cb2c10594d4a Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye(a)bytedance.com>
Date: Sat, 10 Jun 2023 20:30:25 -0700
Subject: [PATCH] net/sched: qdisc_destroy() old ingress and clsact Qdiscs
before grafting
mini_Qdisc_pair::p_miniq is a double pointer to mini_Qdisc, initialized
in ingress_init() to point to net_device::miniq_ingress. ingress Qdiscs
access this per-net_device pointer in mini_qdisc_pair_swap(). The same
applies to clsact Qdiscs and miniq_egress.
Unfortunately, after introducing RTNL-unlocked RTM_{NEW,DEL,GET}TFILTER
requests (thanks Hillf Danton for the hint), when replacing ingress or
clsact Qdiscs, for example, the old Qdisc ("@old") could access the same
miniq_{in,e}gress pointer(s) concurrently with the new Qdisc ("@new"),
causing race conditions [1] including a use-after-free bug in
mini_qdisc_pair_swap() reported by syzbot:
BUG: KASAN: slab-use-after-free in mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
Write of size 8 at addr ffff888045b31308 by task syz-executor690/14901
...
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106
print_address_description.constprop.0+0x2c/0x3c0 mm/kasan/report.c:319
print_report mm/kasan/report.c:430 [inline]
kasan_report+0x11c/0x130 mm/kasan/report.c:536
mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
tcf_chain_head_change_item net/sched/cls_api.c:495 [inline]
tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:509
tcf_chain_tp_insert net/sched/cls_api.c:1826 [inline]
tcf_chain_tp_insert_unique net/sched/cls_api.c:1875 [inline]
tc_new_tfilter+0x1de6/0x2290 net/sched/cls_api.c:2266
...
@old and @new should not affect each other. In other words, @old should
never modify miniq_{in,e}gress after @new, and @new should not update
@old's RCU state.
Fixing without changing sch_api.c turned out to be difficult (please
refer to Closes: for discussions). Instead, make sure @new's first call
always happens after @old's last call (in {ingress,clsact}_destroy()) has
finished:
In qdisc_graft(), return -EBUSY if @old has any ongoing filter requests,
and call qdisc_destroy() for @old before grafting @new.
Introduce qdisc_refcount_dec_if_one() as the counterpart of
qdisc_refcount_inc_nz() used for filter requests. Introduce a
non-static version of qdisc_destroy() that does a TCQ_F_BUILTIN check,
just like qdisc_put() etc.
Depends on patch "net/sched: Refactor qdisc_graft() for ingress and
clsact Qdiscs".
[1] To illustrate, the syzkaller reproducer adds ingress Qdiscs under
TC_H_ROOT (no longer possible after commit c7cfbd115001 ("net/sched:
sch_ingress: Only create under TC_H_INGRESS")) on eth0 that has 8
transmission queues:
Thread 1 creates ingress Qdisc A (containing mini Qdisc a1 and a2),
then adds a flower filter X to A.
Thread 2 creates another ingress Qdisc B (containing mini Qdisc b1 and
b2) to replace A, then adds a flower filter Y to B.
Thread 1                               A's refcnt   Thread 2
RTM_NEWQDISC (A, RTNL-locked)
 qdisc_create(A)                            1
 qdisc_graft(A)                             9
RTM_NEWTFILTER (X, RTNL-unlocked)
 __tcf_qdisc_find(A)                       10
 tcf_chain0_head_change(A)
 mini_qdisc_pair_swap(A) (1st)
        |
        |                                            RTM_NEWQDISC (B, RTNL-locked)
     RCU sync                               2         qdisc_graft(B)
        |                                   1         notify_and_destroy(A)
        |
 tcf_block_release(A)                       0        RTM_NEWTFILTER (Y, RTNL-unlocked)
  qdisc_destroy(A)                                    tcf_chain0_head_change(B)
   tcf_chain0_head_change_cb_del(A)                    mini_qdisc_pair_swap(B) (2nd)
   mini_qdisc_pair_swap(A) (3rd)                           |
           ...                                            ...
Here, B calls mini_qdisc_pair_swap(), pointing eth0->miniq_ingress to
its mini Qdisc, b1. Then, A calls mini_qdisc_pair_swap() again during
ingress_destroy(), setting eth0->miniq_ingress to NULL, so ingress
packets on eth0 will not find filter Y in sch_handle_ingress().
This is just one of the possible consequences of concurrently accessing
miniq_{in,e}gress pointers.
Fixes: 7a096d579e8e ("net: sched: ingress: set 'unlocked' flag for Qdisc ops")
Fixes: 87f373921c4e ("net: sched: ingress: set 'unlocked' flag for clsact Qdisc ops")
Reported-by: syzbot+b53a9c0d1ea4ad62da8b(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/r/0000000000006cf87705f79acf1a@google.com/
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Vlad Buslov <vladbu(a)mellanox.com>
Signed-off-by: Peilin Ye <peilin.ye(a)bytedance.com>
Acked-by: Jamal Hadi Salim <jhs(a)mojatatu.com>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 27271f2b37cb..12eadecf8cd0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -137,6 +137,13 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
refcount_inc(&qdisc->refcnt);
}
+static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return true;
+ return refcount_dec_if_one(&qdisc->refcnt);
+}
+
/* Intended to be used by unlocked users, when concurrent qdisc release is
* possible.
*/
@@ -652,6 +659,7 @@ void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
+void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 094ca3a5b633..aa6b1fe65151 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1086,10 +1086,22 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if ((q && q->flags & TCQ_F_INGRESS) ||
(new && new->flags & TCQ_F_INGRESS)) {
ingress = 1;
- if (!dev_ingress_queue(dev)) {
+ dev_queue = dev_ingress_queue(dev);
+ if (!dev_queue) {
NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
return -ENOENT;
}
+
+ q = rtnl_dereference(dev_queue->qdisc_sleeping);
+
+ /* This is the counterpart of that qdisc_refcount_inc_nz() call in
+ * __tcf_qdisc_find() for filter requests.
+ */
+ if (!qdisc_refcount_dec_if_one(q)) {
+ NL_SET_ERR_MSG(extack,
+ "Current ingress or clsact Qdisc has ongoing filter requests");
+ return -EBUSY;
+ }
}
if (dev->flags & IFF_UP)
@@ -1110,8 +1122,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
qdisc_put(old);
}
} else {
- dev_queue = dev_ingress_queue(dev);
- old = dev_graft_qdisc(dev_queue, new);
+ old = dev_graft_qdisc(dev_queue, NULL);
+
+ /* {ingress,clsact}_destroy() @old before grafting @new to avoid
+ * unprotected concurrent accesses to net_device::miniq_{in,e}gress
+ * pointer(s) in mini_qdisc_pair_swap().
+ */
+ qdisc_notify(net, skb, n, classid, old, new, extack);
+ qdisc_destroy(old);
+
+ dev_graft_qdisc(dev_queue, new);
}
skip:
@@ -1125,8 +1145,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if (new && new->ops->attach)
new->ops->attach(new);
- } else {
- notify_and_destroy(net, skb, n, classid, old, new, extack);
}
if (dev->flags & IFF_UP)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3248259eba32..5d7e23f4cc0e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1046,7 +1046,7 @@ static void qdisc_free_cb(struct rcu_head *head)
qdisc_free(q);
}
-static void qdisc_destroy(struct Qdisc *qdisc)
+static void __qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
@@ -1070,6 +1070,14 @@ static void qdisc_destroy(struct Qdisc *qdisc)
call_rcu(&qdisc->rcu, qdisc_free_cb);
}
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return;
+
+ __qdisc_destroy(qdisc);
+}
+
void qdisc_put(struct Qdisc *qdisc)
{
if (!qdisc)
@@ -1079,7 +1087,7 @@ void qdisc_put(struct Qdisc *qdisc)
!refcount_dec_and_test(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);
@@ -1094,7 +1102,7 @@ void qdisc_put_unlocked(struct Qdisc *qdisc)
!refcount_dec_and_rtnl_lock(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);
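A note on the new helper above: refcount_dec_if_one() succeeds only when the
caller holds the last reference, which is exactly what lets qdisc_graft()
refuse (-EBUSY) to destroy @old while RTNL-unlocked filter requests still
hold references. A minimal userspace analogue of that semantic, assuming C11
atomics (the kernel's refcount_t additionally handles saturation, which this
sketch omits):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Drop the reference only if we are the last holder: a single
     * compare-and-swap of 1 -> 0. If anyone else still holds a reference
     * (count > 1), leave the count untouched and report failure, which
     * maps to the -EBUSY path in qdisc_graft(). */
    static bool refcount_dec_if_one(atomic_int *refcnt)
    {
            int expected = 1;

            return atomic_compare_exchange_strong(refcnt, &expected, 0);
    }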
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 84ad0af0bccd3691cb951c2974c5cb2c10594d4a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061945-polish-remorse-02cf@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 84ad0af0bccd3691cb951c2974c5cb2c10594d4a Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye(a)bytedance.com>
Date: Sat, 10 Jun 2023 20:30:25 -0700
Subject: [PATCH] net/sched: qdisc_destroy() old ingress and clsact Qdiscs
before grafting
mini_Qdisc_pair::p_miniq is a double pointer to mini_Qdisc, initialized
in ingress_init() to point to net_device::miniq_ingress. ingress Qdiscs
access this per-net_device pointer in mini_qdisc_pair_swap(). The same
applies to clsact Qdiscs and miniq_egress.
Unfortunately, after introducing RTNL-unlocked RTM_{NEW,DEL,GET}TFILTER
requests (thanks Hillf Danton for the hint), when replacing ingress or
clsact Qdiscs, for example, the old Qdisc ("@old") could access the same
miniq_{in,e}gress pointer(s) concurrently with the new Qdisc ("@new"),
causing race conditions [1] including a use-after-free bug in
mini_qdisc_pair_swap() reported by syzbot:
BUG: KASAN: slab-use-after-free in mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
Write of size 8 at addr ffff888045b31308 by task syz-executor690/14901
...
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106
print_address_description.constprop.0+0x2c/0x3c0 mm/kasan/report.c:319
print_report mm/kasan/report.c:430 [inline]
kasan_report+0x11c/0x130 mm/kasan/report.c:536
mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
tcf_chain_head_change_item net/sched/cls_api.c:495 [inline]
tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:509
tcf_chain_tp_insert net/sched/cls_api.c:1826 [inline]
tcf_chain_tp_insert_unique net/sched/cls_api.c:1875 [inline]
tc_new_tfilter+0x1de6/0x2290 net/sched/cls_api.c:2266
...
@old and @new should not affect each other. In other words, @old should
never modify miniq_{in,e}gress after @new, and @new should not update
@old's RCU state.
Fixing without changing sch_api.c turned out to be difficult (please
refer to Closes: for discussions). Instead, make sure @new's first call
always happens after @old's last call (in {ingress,clsact}_destroy()) has
finished:
In qdisc_graft(), return -EBUSY if @old has any ongoing filter requests,
and call qdisc_destroy() for @old before grafting @new.
Introduce qdisc_refcount_dec_if_one() as the counterpart of
qdisc_refcount_inc_nz() used for filter requests. Introduce a
non-static version of qdisc_destroy() that does a TCQ_F_BUILTIN check,
just like qdisc_put() etc.
Depends on patch "net/sched: Refactor qdisc_graft() for ingress and
clsact Qdiscs".
[1] To illustrate, the syzkaller reproducer adds ingress Qdiscs under
TC_H_ROOT (no longer possible after commit c7cfbd115001 ("net/sched:
sch_ingress: Only create under TC_H_INGRESS")) on eth0 that has 8
transmission queues:
Thread 1 creates ingress Qdisc A (containing mini Qdisc a1 and a2),
then adds a flower filter X to A.
Thread 2 creates another ingress Qdisc B (containing mini Qdisc b1 and
b2) to replace A, then adds a flower filter Y to B.
Thread 1                               A's refcnt   Thread 2
RTM_NEWQDISC (A, RTNL-locked)
 qdisc_create(A)                            1
 qdisc_graft(A)                             9
RTM_NEWTFILTER (X, RTNL-unlocked)
 __tcf_qdisc_find(A)                       10
 tcf_chain0_head_change(A)
 mini_qdisc_pair_swap(A) (1st)
        |
        |                                            RTM_NEWQDISC (B, RTNL-locked)
     RCU sync                               2         qdisc_graft(B)
        |                                   1         notify_and_destroy(A)
        |
 tcf_block_release(A)                       0        RTM_NEWTFILTER (Y, RTNL-unlocked)
  qdisc_destroy(A)                                    tcf_chain0_head_change(B)
   tcf_chain0_head_change_cb_del(A)                    mini_qdisc_pair_swap(B) (2nd)
   mini_qdisc_pair_swap(A) (3rd)                           |
           ...                                            ...
Here, B calls mini_qdisc_pair_swap(), pointing eth0->miniq_ingress to
its mini Qdisc, b1. Then, A calls mini_qdisc_pair_swap() again during
ingress_destroy(), setting eth0->miniq_ingress to NULL, so ingress
packets on eth0 will not find filter Y in sch_handle_ingress().
This is just one of the possible consequences of concurrently accessing
miniq_{in,e}gress pointers.
Fixes: 7a096d579e8e ("net: sched: ingress: set 'unlocked' flag for Qdisc ops")
Fixes: 87f373921c4e ("net: sched: ingress: set 'unlocked' flag for clsact Qdisc ops")
Reported-by: syzbot+b53a9c0d1ea4ad62da8b(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/r/0000000000006cf87705f79acf1a@google.com/
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Vlad Buslov <vladbu(a)mellanox.com>
Signed-off-by: Peilin Ye <peilin.ye(a)bytedance.com>
Acked-by: Jamal Hadi Salim <jhs(a)mojatatu.com>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 27271f2b37cb..12eadecf8cd0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -137,6 +137,13 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
refcount_inc(&qdisc->refcnt);
}
+static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return true;
+ return refcount_dec_if_one(&qdisc->refcnt);
+}
+
/* Intended to be used by unlocked users, when concurrent qdisc release is
* possible.
*/
@@ -652,6 +659,7 @@ void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
+void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 094ca3a5b633..aa6b1fe65151 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1086,10 +1086,22 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if ((q && q->flags & TCQ_F_INGRESS) ||
(new && new->flags & TCQ_F_INGRESS)) {
ingress = 1;
- if (!dev_ingress_queue(dev)) {
+ dev_queue = dev_ingress_queue(dev);
+ if (!dev_queue) {
NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
return -ENOENT;
}
+
+ q = rtnl_dereference(dev_queue->qdisc_sleeping);
+
+ /* This is the counterpart of that qdisc_refcount_inc_nz() call in
+ * __tcf_qdisc_find() for filter requests.
+ */
+ if (!qdisc_refcount_dec_if_one(q)) {
+ NL_SET_ERR_MSG(extack,
+ "Current ingress or clsact Qdisc has ongoing filter requests");
+ return -EBUSY;
+ }
}
if (dev->flags & IFF_UP)
@@ -1110,8 +1122,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
qdisc_put(old);
}
} else {
- dev_queue = dev_ingress_queue(dev);
- old = dev_graft_qdisc(dev_queue, new);
+ old = dev_graft_qdisc(dev_queue, NULL);
+
+ /* {ingress,clsact}_destroy() @old before grafting @new to avoid
+ * unprotected concurrent accesses to net_device::miniq_{in,e}gress
+ * pointer(s) in mini_qdisc_pair_swap().
+ */
+ qdisc_notify(net, skb, n, classid, old, new, extack);
+ qdisc_destroy(old);
+
+ dev_graft_qdisc(dev_queue, new);
}
skip:
@@ -1125,8 +1145,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if (new && new->ops->attach)
new->ops->attach(new);
- } else {
- notify_and_destroy(net, skb, n, classid, old, new, extack);
}
if (dev->flags & IFF_UP)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3248259eba32..5d7e23f4cc0e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1046,7 +1046,7 @@ static void qdisc_free_cb(struct rcu_head *head)
qdisc_free(q);
}
-static void qdisc_destroy(struct Qdisc *qdisc)
+static void __qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
@@ -1070,6 +1070,14 @@ static void qdisc_destroy(struct Qdisc *qdisc)
call_rcu(&qdisc->rcu, qdisc_free_cb);
}
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return;
+
+ __qdisc_destroy(qdisc);
+}
+
void qdisc_put(struct Qdisc *qdisc)
{
if (!qdisc)
@@ -1079,7 +1087,7 @@ void qdisc_put(struct Qdisc *qdisc)
!refcount_dec_and_test(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);
@@ -1094,7 +1102,7 @@ void qdisc_put_unlocked(struct Qdisc *qdisc)
!refcount_dec_and_rtnl_lock(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 84ad0af0bccd3691cb951c2974c5cb2c10594d4a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061941-anaerobic-washing-b481@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 84ad0af0bccd3691cb951c2974c5cb2c10594d4a Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye(a)bytedance.com>
Date: Sat, 10 Jun 2023 20:30:25 -0700
Subject: [PATCH] net/sched: qdisc_destroy() old ingress and clsact Qdiscs
before grafting
mini_Qdisc_pair::p_miniq is a double pointer to mini_Qdisc, initialized
in ingress_init() to point to net_device::miniq_ingress. ingress Qdiscs
access this per-net_device pointer in mini_qdisc_pair_swap(). The same
applies to clsact Qdiscs and miniq_egress.
Unfortunately, after introducing RTNL-unlocked RTM_{NEW,DEL,GET}TFILTER
requests (thanks Hillf Danton for the hint), when replacing ingress or
clsact Qdiscs, for example, the old Qdisc ("@old") could access the same
miniq_{in,e}gress pointer(s) concurrently with the new Qdisc ("@new"),
causing race conditions [1] including a use-after-free bug in
mini_qdisc_pair_swap() reported by syzbot:
BUG: KASAN: slab-use-after-free in mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
Write of size 8 at addr ffff888045b31308 by task syz-executor690/14901
...
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106
print_address_description.constprop.0+0x2c/0x3c0 mm/kasan/report.c:319
print_report mm/kasan/report.c:430 [inline]
kasan_report+0x11c/0x130 mm/kasan/report.c:536
mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
tcf_chain_head_change_item net/sched/cls_api.c:495 [inline]
tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:509
tcf_chain_tp_insert net/sched/cls_api.c:1826 [inline]
tcf_chain_tp_insert_unique net/sched/cls_api.c:1875 [inline]
tc_new_tfilter+0x1de6/0x2290 net/sched/cls_api.c:2266
...
@old and @new should not affect each other. In other words, @old should
never modify miniq_{in,e}gress after @new, and @new should not update
@old's RCU state.
Fixing without changing sch_api.c turned out to be difficult (please
refer to Closes: for discussions). Instead, make sure @new's first call
always happens after @old's last call (in {ingress,clsact}_destroy()) has
finished:
In qdisc_graft(), return -EBUSY if @old has any ongoing filter requests,
and call qdisc_destroy() for @old before grafting @new.
Introduce qdisc_refcount_dec_if_one() as the counterpart of
qdisc_refcount_inc_nz() used for filter requests. Introduce a
non-static version of qdisc_destroy() that does a TCQ_F_BUILTIN check,
just like qdisc_put() etc.
Depends on patch "net/sched: Refactor qdisc_graft() for ingress and
clsact Qdiscs".
[1] To illustrate, the syzkaller reproducer adds ingress Qdiscs under
TC_H_ROOT (no longer possible after commit c7cfbd115001 ("net/sched:
sch_ingress: Only create under TC_H_INGRESS")) on eth0 that has 8
transmission queues:
Thread 1 creates ingress Qdisc A (containing mini Qdisc a1 and a2),
then adds a flower filter X to A.
Thread 2 creates another ingress Qdisc B (containing mini Qdisc b1 and
b2) to replace A, then adds a flower filter Y to B.
Thread 1                               A's refcnt   Thread 2
RTM_NEWQDISC (A, RTNL-locked)
 qdisc_create(A)                            1
 qdisc_graft(A)                             9
RTM_NEWTFILTER (X, RTNL-unlocked)
 __tcf_qdisc_find(A)                       10
 tcf_chain0_head_change(A)
 mini_qdisc_pair_swap(A) (1st)
        |
        |                                            RTM_NEWQDISC (B, RTNL-locked)
     RCU sync                               2         qdisc_graft(B)
        |                                   1         notify_and_destroy(A)
        |
 tcf_block_release(A)                       0        RTM_NEWTFILTER (Y, RTNL-unlocked)
  qdisc_destroy(A)                                    tcf_chain0_head_change(B)
   tcf_chain0_head_change_cb_del(A)                    mini_qdisc_pair_swap(B) (2nd)
   mini_qdisc_pair_swap(A) (3rd)                           |
           ...                                            ...
Here, B calls mini_qdisc_pair_swap(), pointing eth0->miniq_ingress to
its mini Qdisc, b1. Then, A calls mini_qdisc_pair_swap() again during
ingress_destroy(), setting eth0->miniq_ingress to NULL, so ingress
packets on eth0 will not find filter Y in sch_handle_ingress().
This is just one of the possible consequences of concurrently accessing
miniq_{in,e}gress pointers.
Fixes: 7a096d579e8e ("net: sched: ingress: set 'unlocked' flag for Qdisc ops")
Fixes: 87f373921c4e ("net: sched: ingress: set 'unlocked' flag for clsact Qdisc ops")
Reported-by: syzbot+b53a9c0d1ea4ad62da8b(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/r/0000000000006cf87705f79acf1a@google.com/
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Vlad Buslov <vladbu(a)mellanox.com>
Signed-off-by: Peilin Ye <peilin.ye(a)bytedance.com>
Acked-by: Jamal Hadi Salim <jhs(a)mojatatu.com>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 27271f2b37cb..12eadecf8cd0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -137,6 +137,13 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
refcount_inc(&qdisc->refcnt);
}
+static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return true;
+ return refcount_dec_if_one(&qdisc->refcnt);
+}
+
/* Intended to be used by unlocked users, when concurrent qdisc release is
* possible.
*/
@@ -652,6 +659,7 @@ void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
+void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 094ca3a5b633..aa6b1fe65151 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1086,10 +1086,22 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if ((q && q->flags & TCQ_F_INGRESS) ||
(new && new->flags & TCQ_F_INGRESS)) {
ingress = 1;
- if (!dev_ingress_queue(dev)) {
+ dev_queue = dev_ingress_queue(dev);
+ if (!dev_queue) {
NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
return -ENOENT;
}
+
+ q = rtnl_dereference(dev_queue->qdisc_sleeping);
+
+ /* This is the counterpart of that qdisc_refcount_inc_nz() call in
+ * __tcf_qdisc_find() for filter requests.
+ */
+ if (!qdisc_refcount_dec_if_one(q)) {
+ NL_SET_ERR_MSG(extack,
+ "Current ingress or clsact Qdisc has ongoing filter requests");
+ return -EBUSY;
+ }
}
if (dev->flags & IFF_UP)
@@ -1110,8 +1122,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
qdisc_put(old);
}
} else {
- dev_queue = dev_ingress_queue(dev);
- old = dev_graft_qdisc(dev_queue, new);
+ old = dev_graft_qdisc(dev_queue, NULL);
+
+ /* {ingress,clsact}_destroy() @old before grafting @new to avoid
+ * unprotected concurrent accesses to net_device::miniq_{in,e}gress
+ * pointer(s) in mini_qdisc_pair_swap().
+ */
+ qdisc_notify(net, skb, n, classid, old, new, extack);
+ qdisc_destroy(old);
+
+ dev_graft_qdisc(dev_queue, new);
}
skip:
@@ -1125,8 +1145,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if (new && new->ops->attach)
new->ops->attach(new);
- } else {
- notify_and_destroy(net, skb, n, classid, old, new, extack);
}
if (dev->flags & IFF_UP)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3248259eba32..5d7e23f4cc0e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1046,7 +1046,7 @@ static void qdisc_free_cb(struct rcu_head *head)
qdisc_free(q);
}
-static void qdisc_destroy(struct Qdisc *qdisc)
+static void __qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
@@ -1070,6 +1070,14 @@ static void qdisc_destroy(struct Qdisc *qdisc)
call_rcu(&qdisc->rcu, qdisc_free_cb);
}
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return;
+
+ __qdisc_destroy(qdisc);
+}
+
void qdisc_put(struct Qdisc *qdisc)
{
if (!qdisc)
@@ -1079,7 +1087,7 @@ void qdisc_put(struct Qdisc *qdisc)
!refcount_dec_and_test(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);
@@ -1094,7 +1102,7 @@ void qdisc_put_unlocked(struct Qdisc *qdisc)
!refcount_dec_and_rtnl_lock(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f1a0898b5d6a77d332d036da03bad6fa9770de5b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061907-salami-everyday-f129@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f1a0898b5d6a77d332d036da03bad6fa9770de5b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Fri, 9 Jun 2023 14:29:39 -0700
Subject: [PATCH] wifi: iwlwifi: mvm: spin_lock_bh() to fix lockdep regression
Lockdep on 6.4-rc on ThinkPad X1 Carbon 5th says
=====================================================
WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected
6.4.0-rc5 #1 Not tainted
-----------------------------------------------------
kworker/3:1/49 [HC0[0]:SC0[4]:HE1:SE0] is trying to acquire:
ffff8881066fa368 (&mvm_sta->deflink.lq_sta.rs_drv.pers.lock){+.+.}-{2:2}, at: rs_drv_get_rate+0x46/0xe7
and this task is already holding:
ffff8881066f80a8 (&sta->rate_ctrl_lock){+.-.}-{2:2}, at: rate_control_get_rate+0xbd/0x126
which would create a new lock dependency:
(&sta->rate_ctrl_lock){+.-.}-{2:2} -> (&mvm_sta->deflink.lq_sta.rs_drv.pers.lock){+.+.}-{2:2}
but this new dependency connects a SOFTIRQ-irq-safe lock:
(&sta->rate_ctrl_lock){+.-.}-{2:2}
etc. etc. etc.
Changing the spin_lock() in rs_drv_get_rate() to spin_lock_bh() was not
enough to pacify lockdep, but converting every pers.lock user to the _bh
variants has worked.
Fixes: a8938bc881d2 ("wifi: iwlwifi: mvm: Add locking to the rate read flow")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Link: https://lore.kernel.org/r/79ffcc22-9775-cb6d-3ffd-1a517c40beef@google.com
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
index 23266d0c9ce4..9a20468345e4 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
@@ -2692,7 +2692,7 @@ static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
lq_sta = mvm_sta;
- spin_lock(&lq_sta->pers.lock);
+ spin_lock_bh(&lq_sta->pers.lock);
iwl_mvm_hwrate_to_tx_rate_v1(lq_sta->last_rate_n_flags,
info->band, &info->control.rates[0]);
info->control.rates[0].count = 1;
@@ -2707,7 +2707,7 @@ static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
iwl_mvm_hwrate_to_tx_rate_v1(last_ucode_rate, info->band,
&txrc->reported_rate);
}
- spin_unlock(&lq_sta->pers.lock);
+ spin_unlock_bh(&lq_sta->pers.lock);
}
static void *rs_drv_alloc_sta(void *mvm_rate, struct ieee80211_sta *sta,
@@ -3264,11 +3264,11 @@ void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
/* If it's locked we are in middle of init flow
* just wait for next tx status to update the lq_sta data
*/
- if (!spin_trylock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock))
+ if (!spin_trylock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock))
return;
__iwl_mvm_rs_tx_status(mvm, sta, tid, info, ndp);
- spin_unlock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_unlock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
}
#ifdef CONFIG_MAC80211_DEBUGFS
@@ -4117,9 +4117,9 @@ void iwl_mvm_rs_rate_init(struct iwl_mvm *mvm,
} else {
struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta);
- spin_lock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_lock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
rs_drv_rate_init(mvm, sta, band);
- spin_unlock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_unlock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
}
}
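For readers less familiar with the lock variants: lockdep complains because
pers.lock is taken while sta->rate_ctrl_lock, a softirq-safe lock, is held,
so pers.lock itself becomes reachable from softirq context, and any
process-context holder must keep bottom halves disabled. A sketch of the
window this closes (illustrative only; update_rate_state_locked() is a
made-up wrapper, not the driver's actual call chain):

    /* With plain spin_lock(), a softirq can preempt the holder on the
     * same CPU and try to take the same lock again:
     *
     *   process context                 softirq on the same CPU
     *   spin_lock(&pers.lock);
     *      <softirq fires>              rate-control path runs
     *                                   spin_lock(&pers.lock);  <- deadlock
     *
     * spin_lock_bh() keeps softirqs off for the critical section. */
    static void update_rate_state_locked(struct iwl_lq_sta *lq_sta)
    {
            spin_lock_bh(&lq_sta->pers.lock);   /* BHs disabled on this CPU */
            /* ... update rate-scaling state ... */
            spin_unlock_bh(&lq_sta->pers.lock); /* BHs re-enabled */
    }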
The patch below does not apply to the 6.3-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.3.y
git checkout FETCH_HEAD
git cherry-pick -x f1a0898b5d6a77d332d036da03bad6fa9770de5b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061905-bonelike-reorder-8961@gregkh' --subject-prefix 'PATCH 6.3.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f1a0898b5d6a77d332d036da03bad6fa9770de5b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Fri, 9 Jun 2023 14:29:39 -0700
Subject: [PATCH] wifi: iwlwifi: mvm: spin_lock_bh() to fix lockdep regression
Lockdep on 6.4-rc on ThinkPad X1 Carbon 5th says
=====================================================
WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected
6.4.0-rc5 #1 Not tainted
-----------------------------------------------------
kworker/3:1/49 [HC0[0]:SC0[4]:HE1:SE0] is trying to acquire:
ffff8881066fa368 (&mvm_sta->deflink.lq_sta.rs_drv.pers.lock){+.+.}-{2:2}, at: rs_drv_get_rate+0x46/0xe7
and this task is already holding:
ffff8881066f80a8 (&sta->rate_ctrl_lock){+.-.}-{2:2}, at: rate_control_get_rate+0xbd/0x126
which would create a new lock dependency:
(&sta->rate_ctrl_lock){+.-.}-{2:2} -> (&mvm_sta->deflink.lq_sta.rs_drv.pers.lock){+.+.}-{2:2}
but this new dependency connects a SOFTIRQ-irq-safe lock:
(&sta->rate_ctrl_lock){+.-.}-{2:2}
etc. etc. etc.
Changing the spin_lock() in rs_drv_get_rate() to spin_lock_bh() was not
enough to pacify lockdep, but converting every pers.lock user to the _bh
variants has worked.
Fixes: a8938bc881d2 ("wifi: iwlwifi: mvm: Add locking to the rate read flow")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Link: https://lore.kernel.org/r/79ffcc22-9775-cb6d-3ffd-1a517c40beef@google.com
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
index 23266d0c9ce4..9a20468345e4 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
@@ -2692,7 +2692,7 @@ static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
lq_sta = mvm_sta;
- spin_lock(&lq_sta->pers.lock);
+ spin_lock_bh(&lq_sta->pers.lock);
iwl_mvm_hwrate_to_tx_rate_v1(lq_sta->last_rate_n_flags,
info->band, &info->control.rates[0]);
info->control.rates[0].count = 1;
@@ -2707,7 +2707,7 @@ static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
iwl_mvm_hwrate_to_tx_rate_v1(last_ucode_rate, info->band,
&txrc->reported_rate);
}
- spin_unlock(&lq_sta->pers.lock);
+ spin_unlock_bh(&lq_sta->pers.lock);
}
static void *rs_drv_alloc_sta(void *mvm_rate, struct ieee80211_sta *sta,
@@ -3264,11 +3264,11 @@ void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
/* If it's locked we are in middle of init flow
* just wait for next tx status to update the lq_sta data
*/
- if (!spin_trylock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock))
+ if (!spin_trylock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock))
return;
__iwl_mvm_rs_tx_status(mvm, sta, tid, info, ndp);
- spin_unlock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_unlock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
}
#ifdef CONFIG_MAC80211_DEBUGFS
@@ -4117,9 +4117,9 @@ void iwl_mvm_rs_rate_init(struct iwl_mvm *mvm,
} else {
struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta);
- spin_lock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_lock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
rs_drv_rate_init(mvm, sta, band);
- spin_unlock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_unlock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
}
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 661a4f089317c877aecd598fb70cd46510cc8d29
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061918-disabled-spoiler-d23d@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 661a4f089317c877aecd598fb70cd46510cc8d29 Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio(a)linaro.org>
Date: Wed, 17 May 2023 04:18:50 +0200
Subject: [PATCH] arm64: dts: qcom: sm8550: Use the correct LLCC register
scheme
During the ABI-breaking (for good reasons) conversion of the LLCC
register description, SM8550 was not taken into account, resulting
in LLCC being broken on any kernel containing the patch referenced
in the fixes tag.
Fix it by describing the regions properly.
Fixes: ee13b5008707 ("qcom: llcc/edac: Fix the base address used for accessing LLCC banks")
Signed-off-by: Konrad Dybcio <konrad.dybcio(a)linaro.org>
Acked-by: Manivannan Sadhasivam <mani(a)kernel.org>
Signed-off-by: Bjorn Andersson <andersson(a)kernel.org>
Link: https://lore.kernel.org/r/20230517-topic-kailua-llcc-v1-2-d57bd860c43e@lina…
diff --git a/arch/arm64/boot/dts/qcom/sm8550.dtsi b/arch/arm64/boot/dts/qcom/sm8550.dtsi
index 4c6b2c582b27..558cbc430708 100644
--- a/arch/arm64/boot/dts/qcom/sm8550.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8550.dtsi
@@ -3771,9 +3771,16 @@ gem_noc: interconnect@24100000 {
system-cache-controller@25000000 {
compatible = "qcom,sm8550-llcc";
- reg = <0 0x25000000 0 0x800000>,
+ reg = <0 0x25000000 0 0x200000>,
+ <0 0x25200000 0 0x200000>,
+ <0 0x25400000 0 0x200000>,
+ <0 0x25600000 0 0x200000>,
<0 0x25800000 0 0x200000>;
- reg-names = "llcc_base", "llcc_broadcast_base";
+ reg-names = "llcc0_base",
+ "llcc1_base",
+ "llcc2_base",
+ "llcc3_base",
+ "llcc_broadcast_base";
interrupts = <GIC_SPI 266 IRQ_TYPE_LEVEL_HIGH>;
};
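The corrected reg property follows a simple stride: four per-bank windows of
0x200000 bytes starting at 0x25000000, with the broadcast window directly
after the last bank. A small sketch of that arithmetic (names are
illustrative, not taken from the driver):

    #define SM8550_LLCC_BASE        0x25000000UL
    #define SM8550_LLCC_BANK_SIZE   0x200000UL

    /* llcc0_base..llcc3_base from the devicetree above: bank n starts at
     * base + n * size; the broadcast region sits directly after bank 3. */
    static unsigned long llcc_bank_base(unsigned int n)
    {
            return SM8550_LLCC_BASE + n * SM8550_LLCC_BANK_SIZE;
    }
    /* llcc_bank_base(4) == 0x25800000 == llcc_broadcast_base */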
Please consider adding the following commit to v6.1.x:
0e3172bac3f4 ("drm/amdgpu: Don't set struct drm_driver.output_poll_changed")
This fixes a few issues where a resuming system may end up with a black screen if the display topology has changed. It would be great to have it in the 6.1 branch.
Alex
Building parisc64:a500_defconfig ... failed
--------------
Error log:
drivers/char/agp/parisc-agp.c: In function 'parisc_agp_tlbflush':
drivers/char/agp/parisc-agp.c:98:9: error: implicit declaration of function 'asm_io_sync' [-Werror=implicit-function-declaration]
98 | asm_io_sync();
| ^~~~~~~~~~~
drivers/char/agp/parisc-agp.c: In function 'parisc_agp_insert_memory':
drivers/char/agp/parisc-agp.c:168:25: error: implicit declaration of function 'asm_io_fdc' [-Werror=implicit-function-declaration]
168 | asm_io_fdc(&info->gatt[j]);
| ^~~~~~~~~~
Those functions are indeed not available in v4.1.y.
Guenter
The patch below does not apply to the 6.3-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.3.y
git checkout FETCH_HEAD
git cherry-pick -x 20cb1c2fb7568a6054c55defe044311397e01ddb
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061747-drum-devourer-6205@gregkh' --subject-prefix 'PATCH 6.3.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 20cb1c2fb7568a6054c55defe044311397e01ddb Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei(a)redhat.com>
Date: Sat, 10 Jun 2023 07:42:49 +0800
Subject: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
As noted by Michal, the blkg_iostat_set's in the lockless list hold
references to blkg's to protect against their removal. Those blkg's
hold reference to blkcg. When a cgroup is being destroyed,
cgroup_rstat_flush() is only called at css_release_work_fn() which
is called when the blkcg reference count reaches 0. This circular
dependency will prevent blkcg and some blkgs from being freed after
they are made offline.
It is less of a problem if the cgroup to be destroyed also has other
controllers, like memory, that will call cgroup_rstat_flush() and thereby
clean up the reference count. If block is the only controller that uses
rstat, these offline blkcg and blkgs may never be freed, leaking more
and more memory over time.
To prevent this potential memory leak:
- flush blkcg per-cpu stats list in __blkg_release(), when no new stat
can be added
- add global blkg_stat_lock for covering concurrent parent blkg stat
update
- don't grab the bio->bi_blkg reference when adding the stats into blkcg's
per-cpu stat list, since all stats are guaranteed to be consumed before
the blkg instance is released, and grabbing the blkg reference for stats
was the most fragile part of the original patch
Based on Waiman's patch:
https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.…
Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
Cc: stable(a)vger.kernel.org
Reported-by: Jay Shin <jaeshin(a)redhat.com>
Acked-by: Tejun Heo <tj(a)kernel.org>
Cc: Waiman Long <longman(a)redhat.com>
Cc: mkoutny(a)suse.com
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Signed-off-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/20230609234249.1412858-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ce64dd73cfe..f0b5c9c41cde 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -34,6 +34,8 @@
#include "blk-ioprio.h"
#include "blk-throttle.h"
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
+
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
* blkcg_pol_register_mutex nests outside of it and synchronizes entire
@@ -56,6 +58,8 @@ static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
bool blkcg_debug_stats = false;
+static DEFINE_RAW_SPINLOCK(blkg_stat_lock);
+
#define BLKG_DESTROY_BATCH_SIZE 64
/*
@@ -163,10 +167,20 @@ static void blkg_free(struct blkcg_gq *blkg)
static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+ struct blkcg *blkcg = blkg->blkcg;
+ int cpu;
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
WARN_ON(!bio_list_empty(&blkg->async_bios));
#endif
+ /*
+ * Flush all the non-empty percpu lockless lists before releasing
+ * us, given these stat belongs to us.
+ *
+ * blkg_stat_lock is for serializing blkg stat update
+ */
+ for_each_possible_cpu(cpu)
+ __blkcg_rstat_flush(blkcg, cpu);
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
@@ -951,23 +965,26 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
{
- struct blkcg *blkcg = css_to_blkcg(css);
struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
struct llist_node *lnode;
struct blkg_iostat_set *bisc, *next_bisc;
- /* Root-level stats are sourced from system-wide IO stats */
- if (!cgroup_parent(css->cgroup))
- return;
-
rcu_read_lock();
lnode = llist_del_all(lhead);
if (!lnode)
goto out;
+ /*
+ * For covering concurrent parent blkg update from blkg_release().
+ *
+ * When flushing from cgroup, cgroup_rstat_lock is always held, so
+ * this lock won't cause contention most of time.
+ */
+ raw_spin_lock(&blkg_stat_lock);
+
/*
* Iterate only the iostat_cpu's queued in the lockless list.
*/
@@ -991,13 +1008,19 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent && parent->parent)
blkcg_iostat_update(parent, &blkg->iostat.cur,
&blkg->iostat.last);
- percpu_ref_put(&blkg->refcnt);
}
-
+ raw_spin_unlock(&blkg_stat_lock);
out:
rcu_read_unlock();
}
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ /* Root-level stats are sourced from system-wide IO stats */
+ if (cgroup_parent(css->cgroup))
+ __blkcg_rstat_flush(css_to_blkcg(css), cpu);
+}
+
/*
* We source root cgroup stats from the system-wide stats to avoid
* tracking the same information twice and incurring overhead when no
@@ -2075,7 +2098,6 @@ void blk_cgroup_bio_start(struct bio *bio)
llist_add(&bis->lnode, lhead);
WRITE_ONCE(bis->lqueued, true);
- percpu_ref_get(&bis->blkg->refcnt);
}
u64_stats_update_end_irqrestore(&bis->sync, flags);
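The flushing relied on above is llist's detach-then-walk idiom: producers
push stat nodes locklessly with llist_add(), and the flusher atomically
detaches the whole chain with llist_del_all() and walks it privately, which
is why no per-node locking is needed. A minimal userspace analogue using C11
atomics (struct and function names are illustrative, not the kernel's):

    #include <stdatomic.h>
    #include <stddef.h>

    struct lnode {
            struct lnode *next;
    };

    /* Producer side: lock-free push onto the list head (llist_add()). */
    static void llist_push(_Atomic(struct lnode *) *head, struct lnode *n)
    {
            struct lnode *old = atomic_load(head);

            do {
                    n->next = old;
            } while (!atomic_compare_exchange_weak(head, &old, n));
    }

    /* Consumer side: detach everything at once (llist_del_all()); the
     * returned chain is private, so it can be walked without locking. */
    static struct lnode *llist_pop_all(_Atomic(struct lnode *) *head)
    {
            return atomic_exchange(head, NULL);
    }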