Upstream commit 81b704d3e4674e09781d331df73d76675d5ad8cb
Applies to 4.9-stable tree
----------->8---------------
From: "Rafael J. Wysocki" <rafael.j.wysocki(a)intel.com>
Date: Thu, 14 Jan 2021 19:34:22 +0100
Subject: [PATCH] ACPI: thermal: Do not call acpi_thermal_check() directly
Calling acpi_thermal_check() from acpi_thermal_notify() directly
is problematic if _TMP triggers Notify () on the thermal zone for
which it has been evaluated (which happens on some systems), because
it causes a new acpi_thermal_notify() invocation to be queued up
every time and if that takes place too often, an indefinite number of
pending work items may accumulate in kacpi_notify_wq over time.
Besides, it is not really useful to queue up a new invocation of
acpi_thermal_check() if one of them is pending already.
For these reasons, rework acpi_thermal_notify() to queue up a thermal
check instead of calling acpi_thermal_check() directly and only allow
one thermal check to be pending at a time. Moreover, only allow one
acpi_thermal_check_fn() instance at a time to run
thermal_zone_device_update() for one thermal zone and make it return
early if it sees other instances running for the same thermal zone.
While at it, fold acpi_thermal_check() into acpi_thermal_check_fn(),
as it is only called from there after the other changes made here.
[This issue appears to have been exposed by commit 6d25be5782e4
("sched/core, workqueues: Distangle worker accounting from rq
lock"), but it is unclear why it was not visible earlier.]
BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=208877
Reported-by: Stephen Berman <stephen.berman(a)gmx.net>
Diagnosed-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Tested-by: Stephen Berman <stephen.berman(a)gmx.net>
Cc: All applicable <stable(a)vger.kernel.org>
[bigeasy: Backported to v4.9.y, use atomic_t instead of refcount_t]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
---
drivers/acpi/thermal.c | 55 +++++++++++++++++++++++++++++-------------
1 file changed, 38 insertions(+), 17 deletions(-)
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index 35e8fbca10ad5..c53c88b531639 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -188,6 +188,8 @@ struct acpi_thermal {
int tz_enabled;
int kelvin_offset;
struct work_struct thermal_check_work;
+ struct mutex thermal_check_lock;
+ atomic_t thermal_check_count;
};
/* --------------------------------------------------------------------------
@@ -513,17 +515,6 @@ static int acpi_thermal_get_trip_points(struct acpi_thermal *tz)
return 0;
}
-static void acpi_thermal_check(void *data)
-{
- struct acpi_thermal *tz = data;
-
- if (!tz->tz_enabled)
- return;
-
- thermal_zone_device_update(tz->thermal_zone,
- THERMAL_EVENT_UNSPECIFIED);
-}
-
/* sys I/F for generic thermal sysfs support */
static int thermal_get_temp(struct thermal_zone_device *thermal, int *temp)
@@ -557,6 +548,8 @@ static int thermal_get_mode(struct thermal_zone_device *thermal,
return 0;
}
+static void acpi_thermal_check_fn(struct work_struct *work);
+
static int thermal_set_mode(struct thermal_zone_device *thermal,
enum thermal_device_mode mode)
{
@@ -582,7 +575,7 @@ static int thermal_set_mode(struct thermal_zone_device *thermal,
ACPI_DEBUG_PRINT((ACPI_DB_INFO,
"%s kernel ACPI thermal control\n",
tz->tz_enabled ? "Enable" : "Disable"));
- acpi_thermal_check(tz);
+ acpi_thermal_check_fn(&tz->thermal_check_work);
}
return 0;
}
@@ -951,6 +944,12 @@ static void acpi_thermal_unregister_thermal_zone(struct acpi_thermal *tz)
Driver Interface
-------------------------------------------------------------------------- */
+static void acpi_queue_thermal_check(struct acpi_thermal *tz)
+{
+ if (!work_pending(&tz->thermal_check_work))
+ queue_work(acpi_thermal_pm_queue, &tz->thermal_check_work);
+}
+
static void acpi_thermal_notify(struct acpi_device *device, u32 event)
{
struct acpi_thermal *tz = acpi_driver_data(device);
@@ -961,17 +960,17 @@ static void acpi_thermal_notify(struct acpi_device *device, u32 event)
switch (event) {
case ACPI_THERMAL_NOTIFY_TEMPERATURE:
- acpi_thermal_check(tz);
+ acpi_queue_thermal_check(tz);
break;
case ACPI_THERMAL_NOTIFY_THRESHOLDS:
acpi_thermal_trips_update(tz, ACPI_TRIPS_REFRESH_THRESHOLDS);
- acpi_thermal_check(tz);
+ acpi_queue_thermal_check(tz);
acpi_bus_generate_netlink_event(device->pnp.device_class,
dev_name(&device->dev), event, 0);
break;
case ACPI_THERMAL_NOTIFY_DEVICES:
acpi_thermal_trips_update(tz, ACPI_TRIPS_REFRESH_DEVICES);
- acpi_thermal_check(tz);
+ acpi_queue_thermal_check(tz);
acpi_bus_generate_netlink_event(device->pnp.device_class,
dev_name(&device->dev), event, 0);
break;
@@ -1071,7 +1070,27 @@ static void acpi_thermal_check_fn(struct work_struct *work)
{
struct acpi_thermal *tz = container_of(work, struct acpi_thermal,
thermal_check_work);
- acpi_thermal_check(tz);
+
+ if (!tz->tz_enabled)
+ return;
+ /*
+ * In general, it is not sufficient to check the pending bit, because
+ * subsequent instances of this function may be queued after one of them
+ * has started running (e.g. if _TMP sleeps). Avoid bailing out if just
+ * one of them is running, though, because it may have done the actual
+ * check some time ago, so allow at least one of them to block on the
+ * mutex while another one is running the update.
+ */
+ if (!atomic_add_unless(&tz->thermal_check_count, -1, 1))
+ return;
+
+ mutex_lock(&tz->thermal_check_lock);
+
+ thermal_zone_device_update(tz->thermal_zone, THERMAL_EVENT_UNSPECIFIED);
+
+ atomic_inc(&tz->thermal_check_count);
+
+ mutex_unlock(&tz->thermal_check_lock);
}
static int acpi_thermal_add(struct acpi_device *device)
@@ -1103,6 +1122,8 @@ static int acpi_thermal_add(struct acpi_device *device)
if (result)
goto free_memory;
+ atomic_set(&tz->thermal_check_count, 3);
+ mutex_init(&tz->thermal_check_lock);
INIT_WORK(&tz->thermal_check_work, acpi_thermal_check_fn);
pr_info(PREFIX "%s [%s] (%ld C)\n", acpi_device_name(device),
@@ -1168,7 +1189,7 @@ static int acpi_thermal_resume(struct device *dev)
tz->state.active |= tz->trips.active[i].flags.enabled;
}
- queue_work(acpi_thermal_pm_queue, &tz->thermal_check_work);
+ acpi_queue_thermal_check(tz);
return AE_OK;
}
--
2.30.0
This is the start of the stable review cycle for the 5.4.96 release.
There are 32 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sun, 07 Feb 2021 14:06:42 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.4.96-rc1…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.4.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.4.96-rc1
Peter Zijlstra <peterz(a)infradead.org>
workqueue: Restrict affinity change to rescuer
Peter Zijlstra <peterz(a)infradead.org>
kthread: Extract KTHREAD_IS_PER_CPU
Josh Poimboeuf <jpoimboe(a)redhat.com>
objtool: Don't fail on missing symbol table
Bing Guo <bing.guo(a)amd.com>
drm/amd/display: Change function decide_dp_link_settings to avoid infinite looping
Jake Wang <haonan.wang2(a)amd.com>
drm/amd/display: Update dram_clock_change_latency for DCN2.1
Michael Ellerman <mpe(a)ellerman.id.au>
selftests/powerpc: Only test lwm/stmw on big endian
Revanth Rajashekar <revanth.rajashekar(a)intel.com>
nvme: check the PRINFO bit before deciding the host buffer length
lianzhi chang <changlianzhi(a)uniontech.com>
udf: fix the problem that the disc content is not displayed
Kai-Chuan Hsieh <kaichuan.hsieh(a)canonical.com>
ALSA: hda: Add Cometlake-R PCI ID
Brian King <brking(a)linux.vnet.ibm.com>
scsi: ibmvfc: Set default timeout to avoid crash during migration
Felix Fietkau <nbd(a)nbd.name>
mac80211: fix fast-rx encryption check
Kai-Heng Feng <kai.heng.feng(a)canonical.com>
ASoC: SOF: Intel: hda: Resume codec to do jack detection
Dinghao Liu <dinghao.liu(a)zju.edu.cn>
scsi: fnic: Fix memleak in vnic_dev_init_devcmd2
Javed Hasan <jhasan(a)marvell.com>
scsi: libfc: Avoid invoking response handler twice if ep is already completed
Martin Wilck <mwilck(a)suse.com>
scsi: scsi_transport_srp: Don't block target in failfast state
Peter Zijlstra <peterz(a)infradead.org>
x86: __always_inline __{rd,wr}msr()
Arnold Gozum <arngozum(a)gmail.com>
platform/x86: intel-vbtn: Support for tablet mode on Dell Inspiron 7352
Hans de Goede <hdegoede(a)redhat.com>
platform/x86: touchscreen_dmi: Add swap-x-y quirk for Goodix touchscreen on Estar Beauty HD tablet
Tony Lindgren <tony(a)atomide.com>
phy: cpcap-usb: Fix warning for missing regulator_disable
Eric Dumazet <edumazet(a)google.com>
net_sched: gen_estimator: support large ewma log
ethanwu <ethanwu(a)synology.com>
btrfs: backref, use correct count to resolve normal data refs
ethanwu <ethanwu(a)synology.com>
btrfs: backref, only search backref entries from leaves of the same root
ethanwu <ethanwu(a)synology.com>
btrfs: backref, don't add refs from shared block when resolving normal backref
ethanwu <ethanwu(a)synology.com>
btrfs: backref, only collect file extent items matching backref offset
Enke Chen <enchen(a)paloaltonetworks.com>
tcp: make TCP_USER_TIMEOUT accurate for zero window probes
Catalin Marinas <catalin.marinas(a)arm.com>
arm64: Do not pass tagged addresses to __is_lm_address()
Vincenzo Frascino <vincenzo.frascino(a)arm.com>
arm64: Fix kernel address detection of __is_lm_address()
Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
ACPI: thermal: Do not call acpi_thermal_check() directly
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "Revert "block: end bio with BLK_STS_AGAIN in case of non-mq devs and REQ_NOWAIT""
Lijun Pan <ljp(a)linux.ibm.com>
ibmvnic: Ensure that CRQ entry read are correctly ordered
Rasmus Villemoes <rasmus.villemoes(a)prevas.dk>
net: switchdev: don't set port_obj_info->handled true when -EOPNOTSUPP
Pan Bian <bianpan2016(a)163.com>
net: dsa: bcm_sf2: put device node before return
-------------
Diffstat:
Makefile | 4 +-
arch/arm64/include/asm/memory.h | 10 +-
arch/arm64/mm/physaddr.c | 2 +-
arch/x86/include/asm/msr.h | 4 +-
block/blk-core.c | 11 +-
drivers/acpi/thermal.c | 55 +++++---
drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 3 +
.../gpu/drm/amd/display/dc/dcn21/dcn21_resource.c | 2 +-
drivers/net/dsa/bcm_sf2.c | 8 +-
drivers/net/ethernet/ibm/ibmvnic.c | 6 +
drivers/nvme/host/core.c | 17 ++-
drivers/phy/motorola/phy-cpcap-usb.c | 19 ++-
drivers/platform/x86/intel-vbtn.c | 6 +
drivers/platform/x86/touchscreen_dmi.c | 18 +++
drivers/scsi/fnic/vnic_dev.c | 8 +-
drivers/scsi/ibmvscsi/ibmvfc.c | 4 +-
drivers/scsi/libfc/fc_exch.c | 16 ++-
drivers/scsi/scsi_transport_srp.c | 9 +-
fs/btrfs/backref.c | 157 +++++++++++++--------
fs/udf/super.c | 7 +-
include/linux/kthread.h | 3 +
include/net/tcp.h | 1 +
kernel/kthread.c | 27 +++-
kernel/smpboot.c | 1 +
kernel/workqueue.c | 9 +-
net/core/gen_estimator.c | 11 +-
net/ipv4/tcp_input.c | 1 +
net/ipv4/tcp_output.c | 2 +
net/ipv4/tcp_timer.c | 18 +++
net/mac80211/rx.c | 2 +
net/switchdev/switchdev.c | 23 +--
sound/pci/hda/hda_intel.c | 3 +
sound/soc/sof/intel/hda-codec.c | 3 +-
tools/objtool/elf.c | 7 +-
.../powerpc/alignment/alignment_handler.c | 5 +-
35 files changed, 348 insertions(+), 134 deletions(-)
I'm announcing the release of the 5.4.96 kernel.
All users of the 5.4 kernel series must upgrade.
The updated 5.4.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.4.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2
arch/arm64/include/asm/memory.h | 10
arch/arm64/mm/physaddr.c | 2
arch/x86/include/asm/msr.h | 4
block/blk-core.c | 11
drivers/acpi/thermal.c | 55 ++-
drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 3
drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c | 2
drivers/net/dsa/bcm_sf2.c | 8
drivers/net/ethernet/ibm/ibmvnic.c | 6
drivers/nvme/host/core.c | 17 -
drivers/phy/motorola/phy-cpcap-usb.c | 19 -
drivers/platform/x86/intel-vbtn.c | 6
drivers/platform/x86/touchscreen_dmi.c | 18 +
drivers/scsi/fnic/vnic_dev.c | 8
drivers/scsi/ibmvscsi/ibmvfc.c | 4
drivers/scsi/libfc/fc_exch.c | 16 -
drivers/scsi/scsi_transport_srp.c | 9
fs/btrfs/backref.c | 157 ++++++----
fs/udf/super.c | 7
include/linux/kthread.h | 3
include/net/tcp.h | 1
kernel/kthread.c | 27 +
kernel/smpboot.c | 1
kernel/workqueue.c | 9
net/core/gen_estimator.c | 11
net/ipv4/tcp_input.c | 1
net/ipv4/tcp_output.c | 2
net/ipv4/tcp_timer.c | 18 +
net/mac80211/rx.c | 2
net/switchdev/switchdev.c | 23 -
sound/pci/hda/hda_intel.c | 3
sound/soc/sof/intel/hda-codec.c | 3
tools/objtool/elf.c | 7
tools/testing/selftests/powerpc/alignment/alignment_handler.c | 5
35 files changed, 347 insertions(+), 133 deletions(-)
Arnold Gozum (1):
platform/x86: intel-vbtn: Support for tablet mode on Dell Inspiron 7352
Bing Guo (1):
drm/amd/display: Change function decide_dp_link_settings to avoid infinite looping
Brian King (1):
scsi: ibmvfc: Set default timeout to avoid crash during migration
Catalin Marinas (1):
arm64: Do not pass tagged addresses to __is_lm_address()
Dinghao Liu (1):
scsi: fnic: Fix memleak in vnic_dev_init_devcmd2
Enke Chen (1):
tcp: make TCP_USER_TIMEOUT accurate for zero window probes
Eric Dumazet (1):
net_sched: gen_estimator: support large ewma log
Felix Fietkau (1):
mac80211: fix fast-rx encryption check
Greg Kroah-Hartman (2):
Revert "Revert "block: end bio with BLK_STS_AGAIN in case of non-mq devs and REQ_NOWAIT""
Linux 5.4.96
Hans de Goede (1):
platform/x86: touchscreen_dmi: Add swap-x-y quirk for Goodix touchscreen on Estar Beauty HD tablet
Jake Wang (1):
drm/amd/display: Update dram_clock_change_latency for DCN2.1
Javed Hasan (1):
scsi: libfc: Avoid invoking response handler twice if ep is already completed
Josh Poimboeuf (1):
objtool: Don't fail on missing symbol table
Kai-Chuan Hsieh (1):
ALSA: hda: Add Cometlake-R PCI ID
Kai-Heng Feng (1):
ASoC: SOF: Intel: hda: Resume codec to do jack detection
Lijun Pan (1):
ibmvnic: Ensure that CRQ entry read are correctly ordered
Martin Wilck (1):
scsi: scsi_transport_srp: Don't block target in failfast state
Michael Ellerman (1):
selftests/powerpc: Only test lwm/stmw on big endian
Pan Bian (1):
net: dsa: bcm_sf2: put device node before return
Peter Zijlstra (3):
x86: __always_inline __{rd,wr}msr()
kthread: Extract KTHREAD_IS_PER_CPU
workqueue: Restrict affinity change to rescuer
Rafael J. Wysocki (1):
ACPI: thermal: Do not call acpi_thermal_check() directly
Rasmus Villemoes (1):
net: switchdev: don't set port_obj_info->handled true when -EOPNOTSUPP
Revanth Rajashekar (1):
nvme: check the PRINFO bit before deciding the host buffer length
Tony Lindgren (1):
phy: cpcap-usb: Fix warning for missing regulator_disable
Vincenzo Frascino (1):
arm64: Fix kernel address detection of __is_lm_address()
ethanwu (4):
btrfs: backref, only collect file extent items matching backref offset
btrfs: backref, don't add refs from shared block when resolving normal backref
btrfs: backref, only search backref entries from leaves of the same root
btrfs: backref, use correct count to resolve normal data refs
lianzhi chang (1):
udf: fix the problem that the disc content is not displayed
I'm announcing the release of the 4.19.174 kernel.
All users of the 4.19 kernel series must upgrade.
The updated 4.19.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-4.19.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2
arch/x86/include/asm/msr.h | 4
drivers/acpi/thermal.c | 55 ++++++----
drivers/net/dsa/bcm_sf2.c | 8 +
drivers/net/ethernet/ibm/ibmvnic.c | 6 +
drivers/phy/motorola/phy-cpcap-usb.c | 19 ++-
drivers/platform/x86/intel-vbtn.c | 6 +
drivers/platform/x86/touchscreen_dmi.c | 18 +++
drivers/scsi/ibmvscsi/ibmvfc.c | 4
drivers/scsi/libfc/fc_exch.c | 16 ++
drivers/scsi/scsi_transport_srp.c | 9 +
include/linux/kthread.h | 3
kernel/kthread.c | 27 ++++
kernel/smpboot.c | 1
kernel/sysctl.c | 40 +++++++
kernel/workqueue.c | 9 -
net/core/gen_estimator.c | 11 +-
net/mac80211/rx.c | 2
tools/objtool/elf.c | 7 -
tools/testing/selftests/powerpc/alignment/alignment_handler.c | 5
20 files changed, 205 insertions(+), 47 deletions(-)
Arnold Gozum (1):
platform/x86: intel-vbtn: Support for tablet mode on Dell Inspiron 7352
Brian King (1):
scsi: ibmvfc: Set default timeout to avoid crash during migration
Christian Brauner (1):
sysctl: handle overflow in proc_get_long
Eric Dumazet (1):
net_sched: gen_estimator: support large ewma log
Felix Fietkau (1):
mac80211: fix fast-rx encryption check
Greg Kroah-Hartman (1):
Linux 4.19.174
Hans de Goede (1):
platform/x86: touchscreen_dmi: Add swap-x-y quirk for Goodix touchscreen on Estar Beauty HD tablet
Javed Hasan (1):
scsi: libfc: Avoid invoking response handler twice if ep is already completed
Josh Poimboeuf (1):
objtool: Don't fail on missing symbol table
Lijun Pan (1):
ibmvnic: Ensure that CRQ entry read are correctly ordered
Martin Wilck (1):
scsi: scsi_transport_srp: Don't block target in failfast state
Michael Ellerman (1):
selftests/powerpc: Only test lwm/stmw on big endian
Pan Bian (1):
net: dsa: bcm_sf2: put device node before return
Peter Zijlstra (3):
x86: __always_inline __{rd,wr}msr()
kthread: Extract KTHREAD_IS_PER_CPU
workqueue: Restrict affinity change to rescuer
Rafael J. Wysocki (1):
ACPI: thermal: Do not call acpi_thermal_check() directly
Tony Lindgren (1):
phy: cpcap-usb: Fix warning for missing regulator_disable
I'm announcing the release of the 4.14.220 kernel.
All users of the 4.14 kernel series must upgrade.
The updated 4.14.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-4.14.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2 -
arch/x86/include/asm/msr.h | 4 +-
drivers/acpi/thermal.c | 55 ++++++++++++++++++++++++-----------
drivers/base/core.c | 19 ++++++++++--
drivers/net/dsa/bcm_sf2.c | 8 +++--
drivers/net/ethernet/ibm/ibmvnic.c | 6 +++
drivers/phy/motorola/phy-cpcap-usb.c | 19 ++++++++----
drivers/scsi/ibmvscsi/ibmvfc.c | 4 +-
drivers/scsi/libfc/fc_exch.c | 16 ++++++++--
drivers/scsi/scsi_transport_srp.c | 9 +++++
include/linux/kthread.h | 3 +
kernel/kthread.c | 27 ++++++++++++++++-
kernel/smpboot.c | 1
net/core/gen_estimator.c | 11 ++++---
net/mac80211/rx.c | 2 +
net/sched/sch_api.c | 3 +
tools/objtool/elf.c | 7 +++-
17 files changed, 154 insertions(+), 42 deletions(-)
Benjamin Gaignard (1):
base: core: Remove WARN_ON from link dependencies check
Brian King (1):
scsi: ibmvfc: Set default timeout to avoid crash during migration
Eric Dumazet (2):
net_sched: reject silly cell_log in qdisc_get_rtab()
net_sched: gen_estimator: support large ewma log
Felix Fietkau (1):
mac80211: fix fast-rx encryption check
Greg Kroah-Hartman (1):
Linux 4.14.220
Javed Hasan (1):
scsi: libfc: Avoid invoking response handler twice if ep is already completed
Josh Poimboeuf (1):
objtool: Don't fail on missing symbol table
Lijun Pan (1):
ibmvnic: Ensure that CRQ entry read are correctly ordered
Martin Wilck (1):
scsi: scsi_transport_srp: Don't block target in failfast state
Pan Bian (1):
net: dsa: bcm_sf2: put device node before return
Peter Zijlstra (2):
x86: __always_inline __{rd,wr}msr()
kthread: Extract KTHREAD_IS_PER_CPU
Rafael J. Wysocki (2):
ACPI: thermal: Do not call acpi_thermal_check() directly
driver core: Extend device_is_dependent()
Tony Lindgren (1):
phy: cpcap-usb: Fix warning for missing regulator_disable
This is the start of the stable review cycle for the 4.19.174 release.
There are 17 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sun, 07 Feb 2021 14:06:42 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.174-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.19.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.19.174-rc1
Peter Zijlstra <peterz(a)infradead.org>
workqueue: Restrict affinity change to rescuer
Peter Zijlstra <peterz(a)infradead.org>
kthread: Extract KTHREAD_IS_PER_CPU
Josh Poimboeuf <jpoimboe(a)redhat.com>
objtool: Don't fail on missing symbol table
Michael Ellerman <mpe(a)ellerman.id.au>
selftests/powerpc: Only test lwm/stmw on big endian
Brian King <brking(a)linux.vnet.ibm.com>
scsi: ibmvfc: Set default timeout to avoid crash during migration
Felix Fietkau <nbd(a)nbd.name>
mac80211: fix fast-rx encryption check
Javed Hasan <jhasan(a)marvell.com>
scsi: libfc: Avoid invoking response handler twice if ep is already completed
Martin Wilck <mwilck(a)suse.com>
scsi: scsi_transport_srp: Don't block target in failfast state
Peter Zijlstra <peterz(a)infradead.org>
x86: __always_inline __{rd,wr}msr()
Arnold Gozum <arngozum(a)gmail.com>
platform/x86: intel-vbtn: Support for tablet mode on Dell Inspiron 7352
Hans de Goede <hdegoede(a)redhat.com>
platform/x86: touchscreen_dmi: Add swap-x-y quirk for Goodix touchscreen on Estar Beauty HD tablet
Tony Lindgren <tony(a)atomide.com>
phy: cpcap-usb: Fix warning for missing regulator_disable
Eric Dumazet <edumazet(a)google.com>
net_sched: gen_estimator: support large ewma log
Christian Brauner <christian(a)brauner.io>
sysctl: handle overflow in proc_get_long
Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
ACPI: thermal: Do not call acpi_thermal_check() directly
Lijun Pan <ljp(a)linux.ibm.com>
ibmvnic: Ensure that CRQ entry read are correctly ordered
Pan Bian <bianpan2016(a)163.com>
net: dsa: bcm_sf2: put device node before return
-------------
Diffstat:
Makefile | 4 +-
arch/x86/include/asm/msr.h | 4 +-
drivers/acpi/thermal.c | 55 +++++++++++++++-------
drivers/net/dsa/bcm_sf2.c | 8 +++-
drivers/net/ethernet/ibm/ibmvnic.c | 6 +++
drivers/phy/motorola/phy-cpcap-usb.c | 19 +++++---
drivers/platform/x86/intel-vbtn.c | 6 +++
drivers/platform/x86/touchscreen_dmi.c | 18 +++++++
drivers/scsi/ibmvscsi/ibmvfc.c | 4 +-
drivers/scsi/libfc/fc_exch.c | 16 ++++++-
drivers/scsi/scsi_transport_srp.c | 9 +++-
include/linux/kthread.h | 3 ++
kernel/kthread.c | 27 ++++++++++-
kernel/smpboot.c | 1 +
kernel/sysctl.c | 40 +++++++++++++++-
kernel/workqueue.c | 9 ++--
net/core/gen_estimator.c | 11 +++--
net/mac80211/rx.c | 2 +
tools/objtool/elf.c | 7 ++-
.../powerpc/alignment/alignment_handler.c | 5 +-
20 files changed, 206 insertions(+), 48 deletions(-)
We expect toolchains to produce these new debug info sections as part of
DWARF v5. Add explicit placements to prevent the linker warnings from
--orphan-section=warn.
Compilers may produce such sections with explicit -gdwarf-5, or based on
the implicit default version of DWARF when -g is used via DEBUG_INFO.
This implicit default changes over time, and has changed to DWARF v5
with GCC 11.
.debug_sup was mentioned in review, but without compilers producing it
today, let's wait to add it until it becomes necessary.
Cc: stable(a)vger.kernel.org
Link: https://bugzilla.redhat.com/show_bug.cgi?id=1922707
Reported-by: Chris Murphy <lists(a)colorremedies.com>
Suggested-by: Fangrui Song <maskray(a)google.com>
Reviewed-by: Nathan Chancellor <nathan(a)kernel.org>
Tested-by: Sedat Dilek <sedat.dilek(a)gmail.com>
Signed-off-by: Nick Desaulniers <ndesaulniers(a)google.com>
---
include/asm-generic/vmlinux.lds.h | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 34b7e0d2346c..1e7cde4bd3f9 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -842,8 +842,13 @@
/* DWARF 4 */ \
.debug_types 0 : { *(.debug_types) } \
/* DWARF 5 */ \
+ .debug_addr 0 : { *(.debug_addr) } \
+ .debug_line_str 0 : { *(.debug_line_str) } \
+ .debug_loclists 0 : { *(.debug_loclists) } \
.debug_macro 0 : { *(.debug_macro) } \
- .debug_addr 0 : { *(.debug_addr) }
+ .debug_names 0 : { *(.debug_names) } \
+ .debug_rnglists 0 : { *(.debug_rnglists) } \
+ .debug_str_offsets 0 : { *(.debug_str_offsets) }
/* Stabs debugging sections. */
#define STABS_DEBUG \
--
2.30.0.365.g02bc693789-goog
Do you really want companies only?
If you detail how to install the new kernel
(and any other prerequisites) and if 'ordinary use'
is sufficient, I'm sure others would help.
regards
--
Dave Pawson
XSLT XSL-FO FAQ.
Docbook FAQ.
This is an automatic generated email to let you know that the following patch were queued:
Subject: media: i2c: max9286: fix access to unallocated memory
Author: Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com>
Date: Mon Jan 18 09:14:46 2021 +0100
The asd allocated with v4l2_async_notifier_add_fwnode_subdev() must be
of size max9286_asd, otherwise access to max9286_asd->source will go to
unallocated memory.
Fixes: 86d37bf31af6 ("media: i2c: max9286: Allocate v4l2_async_subdev dynamically")
Signed-off-by: Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com>
Cc: stable(a)vger.kernel.org # v5.10+
Reviewed-by: Laurent Pinchart <laurent.pinchart(a)ideasonboard.com>
Reviewed-by: Kieran Bingham <kieran.bingham+renesas(a)ideasonboard.com>
Tested-by: Kieran Bingham <kieran.bingham+renesas(a)ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
drivers/media/i2c/max9286.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
---
diff --git a/drivers/media/i2c/max9286.c b/drivers/media/i2c/max9286.c
index c82c1493e099..b1e2476d3c9e 100644
--- a/drivers/media/i2c/max9286.c
+++ b/drivers/media/i2c/max9286.c
@@ -580,7 +580,7 @@ static int max9286_v4l2_notifier_register(struct max9286_priv *priv)
asd = v4l2_async_notifier_add_fwnode_subdev(&priv->notifier,
source->fwnode,
- sizeof(*asd));
+ sizeof(struct max9286_asd));
if (IS_ERR(asd)) {
dev_err(dev, "Failed to add subdev for source %u: %ld",
i, PTR_ERR(asd));
On Thu, Feb 04, 2021 at 05:59:42AM +0000, Jari Ruusu wrote:
> Greg,
> I hope that your linux kernel release scripts are
> implemented in a way that understands that PATCHLEVEL= and
> SUBLEVEL= numbers in top-level linux Makefile are encoded
> as 8-bit numbers for LINUX_VERSION_CODE and
> KERNEL_VERSION() macros, and must stay in range 0...255.
> These 8-bit limits are hardcoded in both kernel source and
> userspace ABI.
>
> After 4.9.255 and 4.4.255, your scripts should be
> incrementing a number in EXTRAVERSION= in top-level
> linux Makefile.
Should already be fixed in linux-next, right?
thanks,
greg k-h
The patch titled
Subject: tmpfs: don't use 64-bit inodes by defulat with 32-bit ino_t
has been removed from the -mm tree. Its filename was
tmpfs-dont-use-64-bit-inodes-by-defulat-with-32-bit-ino_t.patch
This patch was dropped because an alternative patch was merged
------------------------------------------------------
From: Seth Forshee <seth.forshee(a)canonical.com>
Subject: tmpfs: don't use 64-bit inodes by defulat with 32-bit ino_t
Currently there seems to be an assumption in tmpfs that 64-bit
architectures also have a 64-bit ino_t. This is not true; s390 at least
has a 32-bit ino_t. With CONFIG_TMPFS_INODE64=y tmpfs mounts will get
64-bit inode numbers and display "inode64" in the mount options, but
passing the "inode64" mount option will fail. This leads to the following
behavior:
# mkdir mnt
# mount -t tmpfs nodev mnt
# mount -o remount,rw mnt
mount: /home/ubuntu/mnt: mount point not mounted or bad option.
As mount sees "inode64" in the mount options and thus passes it in the
options for the remount.
Ideally CONFIG_TMPFS_INODE64 would depend on sizeof(ino_t) < 8, but I
don't think it's possible to test for this (potentially
CONFIG_ARCH_HAS_64BIT_INO_T or similar could be added, but I'm not sure
whether or not that is wanted). So fix this by simply refusing to honor
the CONFIG_TMPFS_INODE64 setting when sizeof(ino_t) < 8.
Link: https://lkml.kernel.org/r/20210205202159.505612-1-seth.forshee@canonical.com
Fixes: ea3271f7196c ("tmpfs: support 64-bit inums per-sb")
Signed-off-by: Seth Forshee <seth.forshee(a)canonical.com>
Cc: Chris Down <chris(a)chrisdown.name>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Amir Goldstein <amir73il(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/shmem.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/shmem.c~tmpfs-dont-use-64-bit-inodes-by-defulat-with-32-bit-ino_t
+++ a/mm/shmem.c
@@ -3733,7 +3733,7 @@ static int shmem_fill_super(struct super
ctx->blocks = shmem_default_max_blocks();
if (!(ctx->seen & SHMEM_SEEN_INODES))
ctx->inodes = shmem_default_max_inodes();
- if (!(ctx->seen & SHMEM_SEEN_INUMS))
+ if (!(ctx->seen & SHMEM_SEEN_INUMS) && sizeof(ino_t) >= 8)
ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
} else {
sb->s_flags |= SB_NOUSER;
_
Patches currently in -mm which might be from seth.forshee(a)canonical.com are
tmpfs-disallow-config_tmpfs_inode64-on-s390.patch
The patch titled
Subject: tmpfs: disallow CONFIG_TMPFS_INODE64 on s390
has been added to the -mm tree. Its filename is
tmpfs-disallow-config_tmpfs_inode64-on-s390.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/tmpfs-disallow-config_tmpfs_inode…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/tmpfs-disallow-config_tmpfs_inode…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Seth Forshee <seth.forshee(a)canonical.com>
Subject: tmpfs: disallow CONFIG_TMPFS_INODE64 on s390
Currently there is an assumption in tmpfs that 64-bit architectures also
have a 64-bit ino_t. This is not true on s390 which has a 32-bit ino_t.
With CONFIG_TMPFS_INODE64=y tmpfs mounts will get 64-bit inode numbers and
display "inode64" in the mount options, but passing the "inode64" mount
option will fail. This leads to the following behavior:
# mkdir mnt
# mount -t tmpfs nodev mnt
# mount -o remount,rw mnt
mount: /home/ubuntu/mnt: mount point not mounted or bad option.
As mount sees "inode64" in the mount options and thus passes it in the
options for the remount.
So prevent CONFIG_TMPFS_INODE64 from being selected on s390.
Link: https://lkml.kernel.org/r/20210205230620.518245-1-seth.forshee@canonical.com
Fixes: ea3271f7196c ("tmpfs: support 64-bit inums per-sb")
Signed-off-by: Seth Forshee <seth.forshee(a)canonical.com>
Cc: Chris Down <chris(a)chrisdown.name>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Amir Goldstein <amir73il(a)gmail.com>
Cc: Heiko Carstens <hca(a)linux.ibm.com>
Cc: Vasily Gorbik <gor(a)linux.ibm.com>
Cc: Christian Borntraeger <borntraeger(a)de.ibm.com>
Cc: <stable(a)vger.kernel.org> [5.9+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/fs/Kconfig~tmpfs-disallow-config_tmpfs_inode64-on-s390
+++ a/fs/Kconfig
@@ -203,7 +203,7 @@ config TMPFS_XATTR
config TMPFS_INODE64
bool "Use 64-bit ino_t by default in tmpfs"
- depends on TMPFS && 64BIT
+ depends on TMPFS && 64BIT && !S390
default n
help
tmpfs has historically used only inode numbers as wide as an unsigned
_
Patches currently in -mm which might be from seth.forshee(a)canonical.com are
tmpfs-disallow-config_tmpfs_inode64-on-s390.patch
tmpfs-dont-use-64-bit-inodes-by-defulat-with-32-bit-ino_t.patch
From: Aurelien Aptel <aaptel(a)suse.com>
Assuming
- //HOST/a is mounted on /mnt
- //HOST/b is mounted on /mnt/b
On a slow connection, running 'df' and killing it while it's
processing /mnt/b can make cifs_get_inode_info() returns -ERESTARTSYS.
This triggers the following chain of events:
=> the dentry revalidation fail
=> dentry is put and released
=> superblock associated with the dentry is put
=> /mnt/b is unmounted
This patch makes cifs_d_revalidate() return the error instead of 0
(invalid) when cifs_revalidate_dentry() fails, except for ENOENT (file
deleted) and ESTALE (file recreated).
Signed-off-by: Aurelien Aptel <aaptel(a)suse.com>
Suggested-by: Shyam Prasad N <nspmangalore(a)gmail.com>
CC: stable(a)vger.kernel.org
---
fs/cifs/dir.c | 22 ++++++++++++++++++++--
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 68900f1629bff..97ac363b5df16 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -737,6 +737,7 @@ static int
cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
struct inode *inode;
+ int rc;
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -746,8 +747,25 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode)))
CIFS_I(inode)->time = 0; /* force reval */
- if (cifs_revalidate_dentry(direntry))
- return 0;
+ rc = cifs_revalidate_dentry(direntry);
+ if (rc) {
+ cifs_dbg(FYI, "cifs_revalidate_dentry failed with rc=%d", rc);
+ switch (rc) {
+ case -ENOENT:
+ case -ESTALE:
+ /*
+ * Those errors mean the dentry is invalid
+ * (file was deleted or recreated)
+ */
+ return 0;
+ default:
+ /*
+ * Otherwise some unexpected error happened
+ * report it as-is to VFS layer
+ */
+ return rc;
+ }
+ }
else {
/*
* If the inode wasn't known to be a dfs entry when
--
2.29.2
The patch titled
Subject: tmpfs: don't use 64-bit inodes by defulat with 32-bit ino_t
has been added to the -mm tree. Its filename is
tmpfs-dont-use-64-bit-inodes-by-defulat-with-32-bit-ino_t.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/tmpfs-dont-use-64-bit-inodes-by-d…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/tmpfs-dont-use-64-bit-inodes-by-d…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Seth Forshee <seth.forshee(a)canonical.com>
Subject: tmpfs: don't use 64-bit inodes by defulat with 32-bit ino_t
Currently there seems to be an assumption in tmpfs that 64-bit
architectures also have a 64-bit ino_t. This is not true; s390 at least
has a 32-bit ino_t. With CONFIG_TMPFS_INODE64=y tmpfs mounts will get
64-bit inode numbers and display "inode64" in the mount options, but
passing the "inode64" mount option will fail. This leads to the following
behavior:
# mkdir mnt
# mount -t tmpfs nodev mnt
# mount -o remount,rw mnt
mount: /home/ubuntu/mnt: mount point not mounted or bad option.
As mount sees "inode64" in the mount options and thus passes it in the
options for the remount.
Ideally CONFIG_TMPFS_INODE64 would depend on sizeof(ino_t) < 8, but I
don't think it's possible to test for this (potentially
CONFIG_ARCH_HAS_64BIT_INO_T or similar could be added, but I'm not sure
whether or not that is wanted). So fix this by simply refusing to honor
the CONFIG_TMPFS_INODE64 setting when sizeof(ino_t) < 8.
Link: https://lkml.kernel.org/r/20210205202159.505612-1-seth.forshee@canonical.com
Fixes: ea3271f7196c ("tmpfs: support 64-bit inums per-sb")
Signed-off-by: Seth Forshee <seth.forshee(a)canonical.com>
Cc: Chris Down <chris(a)chrisdown.name>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Amir Goldstein <amir73il(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/shmem.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/shmem.c~tmpfs-dont-use-64-bit-inodes-by-defulat-with-32-bit-ino_t
+++ a/mm/shmem.c
@@ -3733,7 +3733,7 @@ static int shmem_fill_super(struct super
ctx->blocks = shmem_default_max_blocks();
if (!(ctx->seen & SHMEM_SEEN_INODES))
ctx->inodes = shmem_default_max_inodes();
- if (!(ctx->seen & SHMEM_SEEN_INUMS))
+ if (!(ctx->seen & SHMEM_SEEN_INUMS) && sizeof(ino_t) >= 8)
ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
} else {
sb->s_flags |= SB_NOUSER;
_
Patches currently in -mm which might be from seth.forshee(a)canonical.com are
tmpfs-dont-use-64-bit-inodes-by-defulat-with-32-bit-ino_t.patch
The following commit has been merged into the irq/urgent branch of tip:
Commit-ID: 4c7bcb51ae25f79e3733982e5d0cd8ce8640ddfc
Gitweb: https://git.kernel.org/tip/4c7bcb51ae25f79e3733982e5d0cd8ce8640ddfc
Author: Hans de Goede <hdegoede(a)redhat.com>
AuthorDate: Mon, 21 Dec 2020 19:56:47 +01:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Fri, 05 Feb 2021 20:48:28 +01:00
genirq: Prevent [devm_]irq_alloc_desc from returning irq 0
Since commit a85a6c86c25b ("driver core: platform: Clarify that IRQ 0
is invalid"), having a linux-irq with number 0 will trigger a WARN()
when calling platform_get_irq*() to retrieve that linux-irq.
Since [devm_]irq_alloc_desc allocs a single irq and since irq 0 is not used
on some systems, it can return 0, triggering that WARN(). This happens
e.g. on Intel Bay Trail and Cherry Trail devices using the LPE audio engine
for HDMI audio:
0 is an invalid IRQ number
WARNING: CPU: 3 PID: 472 at drivers/base/platform.c:238 platform_get_irq_optional+0x108/0x180
Modules linked in: snd_hdmi_lpe_audio(+) ...
Call Trace:
platform_get_irq+0x17/0x30
hdmi_lpe_audio_probe+0x4a/0x6c0 [snd_hdmi_lpe_audio]
---[ end trace ceece38854223a0b ]---
Change the 'from' parameter passed to __[devm_]irq_alloc_descs() by the
[devm_]irq_alloc_desc macros from 0 to 1, so that these macros will no
longer return 0.
Fixes: a85a6c86c25b ("driver core: platform: Clarify that IRQ 0 is invalid")
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20201221185647.226146-1-hdegoede@redhat.com
---
include/linux/irq.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 4aeb1c4..2efde6a 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -928,7 +928,7 @@ int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
__irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL)
#define irq_alloc_desc(node) \
- irq_alloc_descs(-1, 0, 1, node)
+ irq_alloc_descs(-1, 1, 1, node)
#define irq_alloc_desc_at(at, node) \
irq_alloc_descs(at, at, 1, node)
@@ -943,7 +943,7 @@ int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
__devm_irq_alloc_descs(dev, irq, from, cnt, node, THIS_MODULE, NULL)
#define devm_irq_alloc_desc(dev, node) \
- devm_irq_alloc_descs(dev, -1, 0, 1, node)
+ devm_irq_alloc_descs(dev, -1, 1, 1, node)
#define devm_irq_alloc_desc_at(dev, at, node) \
devm_irq_alloc_descs(dev, at, at, 1, node)
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: c4bed4b96918ff1d062ee81fdae4d207da4fa9b0
Gitweb: https://git.kernel.org/tip/c4bed4b96918ff1d062ee81fdae4d207da4fa9b0
Author: Lai Jiangshan <laijs(a)linux.alibaba.com>
AuthorDate: Thu, 04 Feb 2021 23:27:06 +08:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Fri, 05 Feb 2021 20:13:11 +01:00
x86/debug: Prevent data breakpoints on __per_cpu_offset
When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU GSBASE value
via __per_cpu_offset or pcpu_unit_offsets.
When a data breakpoint is set on __per_cpu_offset[cpu] (read-write
operation), the specific CPU will be stuck in an infinite #DB loop.
RCU will try to send an NMI to the specific CPU, but it is not working
either since NMI also relies on paranoid_entry(). Which means it's
undebuggable.
Fixes: eaad981291ee3("x86/entry/64: Introduce the FIND_PERCPU_BASE macro")
Signed-off-by: Lai Jiangshan <laijs(a)linux.alibaba.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210204152708.21308-1-jiangshanlai@gmail.com
---
arch/x86/kernel/hw_breakpoint.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 6694c0f..012ed82 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -269,6 +269,20 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
CPU_ENTRY_AREA_TOTAL_SIZE))
return true;
+ /*
+ * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
+ * GSBASE value via __per_cpu_offset or pcpu_unit_offsets.
+ */
+#ifdef CONFIG_SMP
+ if (within_area(addr, end, (unsigned long)__per_cpu_offset,
+ sizeof(unsigned long) * nr_cpu_ids))
+ return true;
+#else
+ if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets,
+ sizeof(pcpu_unit_offsets)))
+ return true;
+#endif
+
for_each_possible_cpu(cpu) {
/* The original rw GDT is being used after load_direct_gdt() */
if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 3943abf2dbfae9ea4d2da05c1db569a0603f76da
Gitweb: https://git.kernel.org/tip/3943abf2dbfae9ea4d2da05c1db569a0603f76da
Author: Lai Jiangshan <laijs(a)linux.alibaba.com>
AuthorDate: Thu, 04 Feb 2021 23:27:07 +08:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Fri, 05 Feb 2021 20:13:12 +01:00
x86/debug: Prevent data breakpoints on cpu_dr7
local_db_save() is called at the start of exc_debug_kernel(), reads DR7 and
disables breakpoints to prevent recursion.
When running in a guest (X86_FEATURE_HYPERVISOR), local_db_save() reads the
per-cpu variable cpu_dr7 to check whether a breakpoint is active or not
before it accesses DR7.
A data breakpoint on cpu_dr7 therefore results in infinite #DB recursion.
Disallow data breakpoints on cpu_dr7 to prevent that.
Fixes: 84b6a3491567a("x86/entry: Optimize local_db_save() for virt")
Signed-off-by: Lai Jiangshan <laijs(a)linux.alibaba.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210204152708.21308-2-jiangshanlai@gmail.com
---
arch/x86/kernel/hw_breakpoint.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 012ed82..668a4a6 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -307,6 +307,14 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
(unsigned long)&per_cpu(cpu_tlbstate, cpu),
sizeof(struct tlb_state)))
return true;
+
+ /*
+ * When in guest (X86_FEATURE_HYPERVISOR), local_db_save()
+ * will read per-cpu cpu_dr7 before clear dr7 register.
+ */
+ if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu),
+ sizeof(cpu_dr7)))
+ return true;
}
return false;
Right now SUBLEVEL is overflowing, and some userspace may start treating
4.9.256 as 4.10. While out of tree modules have different ways of
extracting the version number (and we're generally ok with breaking
them), we do care about breaking userspace and it would appear that this
overflow might do just that.
Our rules around userspace ABI in the stable kernel are pretty simple:
we don't break it. Thus, while userspace may be checking major/minor, it
shouldn't be doing anything with sublevel.
This patch applies a big band-aid to the 4.9 and 4.4 kernels in the form
of clamping their sublevel to 255.
The clamp is done for the purpose of LINUX_VERSION_CODE only, and
extracting the version number from the Makefile or "make kernelversion"
will continue to work as intended.
We might need to do it later in newer trees, but maybe we'll have a
better solution by then, so I'm ignoring that problem for now.
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Makefile b/Makefile
index 69af44d3dcd14..93f7ce06fc8c9 100644
--- a/Makefile
+++ b/Makefile
@@ -1141,7 +1141,7 @@ endef
define filechk_version.h
(echo \#define LINUX_VERSION_CODE $(shell \
- expr $(VERSION) \* 65536 + 0$(PATCHLEVEL) \* 256 + 0$(SUBLEVEL)); \
+ expr $(VERSION) \* 65536 + 0$(PATCHLEVEL) \* 256 + 255); \
echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))';)
endef
--
2.27.0
This is a note to let you know that I've just added the patch titled
staging: rtl8188eu: Add Edimax EW-7811UN V2 to device table
to my staging git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
in the staging-next branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will also be merged in the next major kernel release
during the merge window.
If you have any questions about this process, please let me know.
>From 7a8d2f1908a59003e55ef8691d09efb7fbc51625 Mon Sep 17 00:00:00 2001
From: Martin Kaiser <martin(a)kaiser.cx>
Date: Thu, 4 Feb 2021 09:52:17 +0100
Subject: staging: rtl8188eu: Add Edimax EW-7811UN V2 to device table
The Edimax EW-7811UN V2 uses an RTL8188EU chipset and works with this
driver.
Signed-off-by: Martin Kaiser <martin(a)kaiser.cx>
Cc: stable <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20210204085217.9743-1-martin@kaiser.cx
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/staging/rtl8188eu/os_dep/usb_intf.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/staging/rtl8188eu/os_dep/usb_intf.c b/drivers/staging/rtl8188eu/os_dep/usb_intf.c
index 43ebd11b53fe..efad43d8e465 100644
--- a/drivers/staging/rtl8188eu/os_dep/usb_intf.c
+++ b/drivers/staging/rtl8188eu/os_dep/usb_intf.c
@@ -41,6 +41,7 @@ static const struct usb_device_id rtw_usb_id_tbl[] = {
{USB_DEVICE(0x2357, 0x0111)}, /* TP-Link TL-WN727N v5.21 */
{USB_DEVICE(0x2C4E, 0x0102)}, /* MERCUSYS MW150US v2 */
{USB_DEVICE(0x0df6, 0x0076)}, /* Sitecom N150 v2 */
+ {USB_DEVICE(0x7392, 0xb811)}, /* Edimax EW-7811UN V2 */
{USB_DEVICE(USB_VENDER_ID_REALTEK, 0xffef)}, /* Rosewill RNX-N150NUB */
{} /* Terminating entry */
};
--
2.30.0
From: Filipe Laíns <lains(a)riseup.net>
In e400071a805d6229223a98899e9da8c6233704a1 I added support for the
receiver that comes with the G602 device, but unfortunately I screwed up
during testing and it seems the keyboard events were actually not being
sent to userspace.
This resulted in keyboard events being broken in userspace, please
backport the fix.
The receiver uses the normal 0x01 Logitech keyboard report descriptor,
as expected, so it is just a matter of flagging it as supported.
Reported in
https://github.com/libratbag/libratbag/issues/1124
Fixes: e400071a805d6 ("HID: logitech-dj: add the G602 receiver")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Filipe Laíns <lains(a)riseup.net>
---
Changes in v2:
- added missing Fixes: anc Cc: tags
drivers/hid/hid-logitech-dj.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c
index 6596c81947a8..2703333edc34 100644
--- a/drivers/hid/hid-logitech-dj.c
+++ b/drivers/hid/hid-logitech-dj.c
@@ -981,6 +981,7 @@ static void logi_hidpp_recv_queue_notif(struct hid_device *hdev,
case 0x07:
device_type = "eQUAD step 4 Gaming";
logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem);
+ workitem.reports_supported |= STD_KEYBOARD;
break;
case 0x08:
device_type = "eQUAD step 4 for gamepads";
--
2.30.0
KASAN reserves "redzone" areas between stack frames in order to detect
stack overruns. A read or write to such an area triggers a KASAN
"stack-out-of-bounds" BUG.
Normally, the ORC unwinder stays in-bounds and doesn't access the
redzone. But sometimes it can't find ORC metadata for a given
instruction. This can happen for code which is missing ORC metadata, or
for generated code. In such cases, the unwinder attempts to fall back
to frame pointers, as a best-effort type thing.
This fallback often works, but when it doesn't, the unwinder can get
confused and go off into the weeds into the KASAN redzone, triggering
the aforementioned KASAN BUG.
But in this case, the unwinder's confusion is actually harmless and
working as designed. It already has checks in place to prevent
off-stack accesses, but those checks get short-circuited by the KASAN
BUG. And a BUG is a lot more disruptive than a harmless unwinder
warning.
Disable the KASAN checks by using READ_ONCE_NOCHECK() for all stack
accesses. This finishes the job started by commit 881125bfe65b
("x86/unwind: Disable KASAN checking in the ORC unwinder"), which only
partially fixed the issue.
Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder")
Reported-by: Ivan Babrou <ivan(a)cloudflare.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Josh Poimboeuf <jpoimboe(a)redhat.com>
---
arch/x86/kernel/unwind_orc.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 73f800100066..c451d5f6422f 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
return false;
- *ip = regs->ip;
- *sp = regs->sp;
+ *ip = READ_ONCE_NOCHECK(regs->ip);
+ *sp = READ_ONCE_NOCHECK(regs->sp);
return true;
}
@@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr
if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
return false;
- *ip = regs->ip;
- *sp = regs->sp;
+ *ip = READ_ONCE_NOCHECK(regs->ip);
+ *sp = READ_ONCE_NOCHECK(regs->sp);
return true;
}
@@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state *state, unsigned int reg_off,
return false;
if (state->full_regs) {
- *val = ((unsigned long *)state->regs)[reg];
+ *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]);
return true;
}
if (state->prev_regs) {
- *val = ((unsigned long *)state->prev_regs)[reg];
+ *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]);
return true;
}
--
2.29.2
From: Hongwei Ma <mahongwei_123(a)126.com>
tuple is only allcated for standard SDIO card, especially it causes memory
corruption issue when the non-standard SDIO card has removed since the card
device's reference counter does not increase for it at sdio_init_func, but
all SDIO card device reference counter has decreased at sdio_release_func.
Cc: <stable(a)vger.kernel.org> #v5.4+
Signed-off-by: Peter Chen <peter.chen(a)kernel.org>
---
drivers/mmc/core/sdio_bus.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index 3d709029e07c..50279f805a53 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -292,7 +292,8 @@ static void sdio_release_func(struct device *dev)
{
struct sdio_func *func = dev_to_sdio_func(dev);
- sdio_free_func_cis(func);
+ if (!(card->quirks & MMC_QUIRK_NONSTD_SDIO))
+ sdio_free_func_cis(func);
kfree(func->info);
kfree(func->tmpbuf);
--
2.17.1
From: Xiao Ni <xni(a)redhat.com>
One customer reports a crash problem which causes by flush request. It
triggers a warning before crash.
/* new request after previous flush is completed */
if (ktime_after(req_start, mddev->prev_flush_start)) {
WARN_ON(mddev->flush_bio);
mddev->flush_bio = bio;
bio = NULL;
}
The WARN_ON is triggered. We use spin lock to protect prev_flush_start and
flush_bio in md_flush_request. But there is no lock protection in
md_submit_flush_data. It can set flush_bio to NULL first because of
compiler reordering write instructions.
For example, flush bio1 sets flush bio to NULL first in
md_submit_flush_data. An interrupt or vmware causing an extended stall
happen between updating flush_bio and prev_flush_start. Because flush_bio
is NULL, flush bio2 can get the lock and submit to underlayer disks. Then
flush bio1 updates prev_flush_start after the interrupt or extended stall.
Then flush bio3 enters in md_flush_request. The start time req_start is
behind prev_flush_start. The flush_bio is not NULL(flush bio2 hasn't
finished). So it can trigger the WARN_ON now. Then it calls INIT_WORK
again. INIT_WORK() will re-initialize the list pointers in the
work_struct, which then can result in a corrupted work list and the
work_struct queued a second time. With the work list corrupted, it can
lead in invalid work items being used and cause a crash in
process_one_work.
We need to make sure only one flush bio can be handled at one same time.
So add spin lock in md_submit_flush_data to protect prev_flush_start and
flush_bio in an atomic way.
Reviewed-by: David Jeffery <djeffery(a)redhat.com>
Signed-off-by: Xiao Ni <xni(a)redhat.com>
Signed-off-by: Song Liu <songliubraving(a)fb.com>
[jwang: backport dc5d17a3c39b06aef866afca19245a9cfb533a79]
Signed-off-by: Jack Wang <jinpu.wang(a)cloud.ionos.com>
---
drivers/md/md.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ec5dfb7ae4e1..cc38530804c9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -538,8 +538,10 @@ static void md_submit_flush_data(struct work_struct *ws)
* could wait for this and below md_handle_request could wait for those
* bios because of suspend check
*/
+ spin_lock_irq(&mddev->lock);
mddev->last_flush = mddev->start_flush;
mddev->flush_bio = NULL;
+ spin_unlock_irq(&mddev->lock);
wake_up(&mddev->sb_wait);
if (bio->bi_iter.bi_size == 0) {
--
2.25.1
Ref: https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git/tre…
This set required 1 additional patch dragged back from v4.14 to avoid build errors.
Arnd Bergmann (1):
y2038: futex: Move compat implementation into futex.c
Thomas Gleixner (11):
futex: Move futex exit handling into futex code
futex: Replace PF_EXITPIDONE with a state
exit/exec: Seperate mm_release()
futex: Split futex_mm_release() for exit/exec
futex: Set task::futex_state to DEAD right after handling futex exit
futex: Mark the begin of futex exit explicitly
futex: Sanitize exit state handling
futex: Provide state handling for exec() as well
futex: Add mutex around futex exit
futex: Provide distinct return value when owner is exiting
futex: Prevent exit livelock
fs/exec.c | 2 +-
include/linux/compat.h | 2 -
include/linux/futex.h | 44 ++--
include/linux/sched.h | 9 +-
kernel/Makefile | 3 -
kernel/exit.c | 25 +--
kernel/fork.c | 40 ++--
kernel/futex.c | 446 ++++++++++++++++++++++++++++++++++++++---
kernel/futex_compat.c | 201 -------------------
9 files changed, 466 insertions(+), 306 deletions(-)
delete mode 100644 kernel/futex_compat.c
Cc: Stable Team <stable(a)vger.kernel.org>
--
2.25.1
We allocate 2MB chunks at a time, so it might appear that a page fault
has already been handled by a previous page fault when we reach
panfrost_mmu_map_fault_addr(). Bail out in that case to avoid mapping the
same area twice.
Cc: <stable(a)vger.kernel.org>
Fixes: 187d2929206e ("drm/panfrost: Add support for GPU heap allocations")
Signed-off-by: Boris Brezillon <boris.brezillon(a)collabora.com>
Reviewed-by: Steven Price <steven.price(a)arm.com>
---
drivers/gpu/drm/panfrost/panfrost_mmu.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c
index 904d63450862..21e552d1ac71 100644
--- a/drivers/gpu/drm/panfrost/panfrost_mmu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c
@@ -488,8 +488,14 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as,
}
bo->base.pages = pages;
bo->base.pages_use_count = 1;
- } else
+ } else {
pages = bo->base.pages;
+ if (pages[page_offset]) {
+ /* Pages are already mapped, bail out. */
+ mutex_unlock(&bo->base.pages_lock);
+ goto out;
+ }
+ }
mapping = bo->base.base.filp->f_mapping;
mapping_set_unevictable(mapping);
@@ -522,6 +528,7 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as,
dev_dbg(pfdev->dev, "mapped page fault @ AS%d %llx", as, addr);
+out:
panfrost_gem_mapping_put(bomapping);
return 0;
--
2.26.2
When a fault is handled it will unblock the GPU which will continue
executing its shader and might fault almost immediately on a different
page. If we clear interrupts after handling the fault we might miss new
faults, so clear them before.
Cc: <stable(a)vger.kernel.org>
Fixes: 187d2929206e ("drm/panfrost: Add support for GPU heap allocations")
Signed-off-by: Boris Brezillon <boris.brezillon(a)collabora.com>
Reviewed-by: Steven Price <steven.price(a)arm.com>
---
drivers/gpu/drm/panfrost/panfrost_mmu.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c
index 7c1b3481b785..904d63450862 100644
--- a/drivers/gpu/drm/panfrost/panfrost_mmu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c
@@ -593,6 +593,8 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data)
access_type = (fault_status >> 8) & 0x3;
source_id = (fault_status >> 16);
+ mmu_write(pfdev, MMU_INT_CLEAR, mask);
+
/* Page fault only */
ret = -1;
if ((status & mask) == BIT(i) && (exception_type & 0xF8) == 0xC0)
@@ -616,8 +618,6 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data)
access_type, access_type_name(pfdev, fault_status),
source_id);
- mmu_write(pfdev, MMU_INT_CLEAR, mask);
-
status &= ~mask;
}
--
2.26.2
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3d1cf435e201d1fd63e4346b141881aed086effd Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki(a)intel.com>
Date: Fri, 15 Jan 2021 19:30:51 +0100
Subject: [PATCH] driver core: Extend device_is_dependent()
If the device passed as the target (second argument) to
device_is_dependent() is not completely registered (that is, it has
been initialized, but not added yet), but the parent pointer of it
is set, it may be missing from the list of the parent's children
and device_for_each_child() called by device_is_dependent() cannot
be relied on to catch that dependency.
For this reason, modify device_is_dependent() to check the ancestors
of the target device by following its parent pointer in addition to
the device_for_each_child() walk.
Fixes: 9ed9895370ae ("driver core: Functional dependencies tracking support")
Reported-by: Stephan Gerhold <stephan(a)gerhold.net>
Tested-by: Stephan Gerhold <stephan(a)gerhold.net>
Reviewed-by: Saravana Kannan <saravanak(a)google.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Link: https://lore.kernel.org/r/17705994.d592GUb2YH@kreacher
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 25e08e5f40bd..3819fd012e27 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -208,6 +208,16 @@ int device_links_read_lock_held(void)
#endif
#endif /* !CONFIG_SRCU */
+static bool device_is_ancestor(struct device *dev, struct device *target)
+{
+ while (target->parent) {
+ target = target->parent;
+ if (dev == target)
+ return true;
+ }
+ return false;
+}
+
/**
* device_is_dependent - Check if one device depends on another one
* @dev: Device to check dependencies for.
@@ -221,7 +231,12 @@ int device_is_dependent(struct device *dev, void *target)
struct device_link *link;
int ret;
- if (dev == target)
+ /*
+ * The "ancestors" check is needed to catch the case when the target
+ * device has not been completely initialized yet and it is still
+ * missing from the list of children of its parent device.
+ */
+ if (dev == target || device_is_ancestor(dev, target))
return 1;
ret = device_for_each_child(dev, target, device_is_dependent);
Hi,
please apply the following commits to 5.4.x tree. They're fixing a problem with
a long running transaction and there are users that have hit that. Other users
from the community have been using the patches on 5.4 base so they can be
considered tested.
All apply cleanly on top of current 5.4 tree. Thanks.
7ac8b88ee668a5b4743ebf3e9888fabac85c334a
ed58f2e66e849c34826083e5a6c1b506ee8a4d8e
cfc0eed0ec89db7c4a8d461174cabfaa4a0912c7
b25b0b871f206936d5bca02b80d38c05623e27da
Short ids with subjects:
7ac8b88ee668 ("btrfs: backref, only collect file extent items matching backref offset")
ed58f2e66e84 ("btrfs: backref, don't add refs from shared block when resolving normal backref")
cfc0eed0ec89 ("btrfs: backref, only search backref entries from leaves of the same root")
b25b0b871f20 ("btrfs: backref, use correct count to resolve normal data refs")
Hi Greg and Sasha,
Please apply commit 28187dc8ebd9 ("ARM: 9025/1: Kconfig: CPU_BIG_ENDIAN
depends on !LD_IS_LLD") to linux-5.10.y, as it fixes a series of errors
that are seen with ARCH=arm LLVM=1 all{mod,yes}config. CONFIG_LD_IS_LLD
was introduced in 5.8 so it currently does not need to go back farther
than 5.10.
Cheers,
Nathan
Hi,
this upstream patch 7f2923c4f73f21cfd714d12a2d48de8c21f11cfe, should
also be applied to 4.19.
The other patch of this series (sysctl: handle overflow for file-max)
was already applied.
This was found by ltp test (sysctl02).
Thanks,
Joerg
Hi Greg, hi Sasha,
Please consider to include following fixes in to stable tree.
The 6 patches from Ming was fixing a deadlock, they are included around kernel
5.3/4. Would be good to included in 4.19, we hit it during testing, with the fix
we no longer hit the deadlock.
The md fixes is for a panic regarding handle flush.
the last one is simple NULL pointer deref fix.
Thanks!
Jack Wang @ IONOS Cloud.
Ming Lei (6):
block: don't hold q->sysfs_lock in elevator_init_mq
blk-mq: don't hold q->sysfs_lock in blk_mq_map_swqueue
block: add helper for checking if queue is registered
block: split .sysfs_lock into two locks
block: fix race between switching elevator and removing queues
block: don't release queue's sysfs lock during switching elevator
Xiao Ni (1):
md: Set prev_flush_start and flush_bio in an atomic way
zhengbin (1):
block: fix NULL pointer dereference in register_disk
block/blk-core.c | 1 +
block/blk-mq-sysfs.c | 12 +++++------
block/blk-mq.c | 7 ------
block/blk-sysfs.c | 49 +++++++++++++++++++++++++++---------------
block/blk-wbt.c | 2 +-
block/blk.h | 2 +-
block/elevator.c | 44 +++++++++++++++++++++----------------
block/genhd.c | 10 +++++----
drivers/md/md.c | 2 ++
include/linux/blkdev.h | 2 ++
10 files changed, 76 insertions(+), 55 deletions(-)
--
2.25.1
On Mon, Jan 25, 2021 at 12:49:39PM -0800, Linus Torvalds wrote:
> On Mon, Jan 25, 2021 at 12:35 PM Chris Wilson <chris(a)chris-wilson.co.uk> wrote:
>
> Mike: should we perhaps revert the first patch too (commit
> bde9cfa3afe4: "x86/setup: don't remove E820_TYPE_RAM for pfn 0")?
Unfortunately, I was too optimistic and didn't take into account that this
commit changes the way /dev/mem sees the first page of memory.
There were reports of slackware users about issues with lilo after upgrade
from 5.10.11 to 5.10.12
https://www.linuxquestions.org/questions/slackware-14/slackware-current-lil…
The root cause is that lilo is no longer able to access the first memory
page via /dev/mem because its type was changed from E820_TYPE_RESERVED to
E820_TYPE_RAM, so this became a part of the "System RAM" resource and
devmem_is_allowed() considers it disallowed area.
So here's the revert of bde9cfa3afe4 as well.
>From a7fdc4117010d393dd77b99da5b573a5c98453ce Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt(a)linux.ibm.com>
Date: Thu, 4 Feb 2021 20:12:37 +0200
Subject: [PATCH] Revert "x86/setup: don't remove E820_TYPE_RAM for pfn 0"
This reverts commit bde9cfa3afe4324ec251e4af80ebf9b7afaf7afe.
Changing the first memory page type from E820_TYPE_RESERVED to
E820_TYPE_RAM makes it a part of "System RAM" resource rather than a
reserved resource and this in turn causes devmem_is_allowed() to treat is
as area that can be accessed but it is filled with zeroes instead of the
actual data as previously.
The change in /dev/mem output causes lilo to fail as was reported at
slakware users forum [1], and probably other legacy applications will
experience similar problems.
[1] https://www.linuxquestions.org/questions/slackware-14/slackware-current-lil…
Signed-off-by: Mike Rapoport <rppt(a)linux.ibm.com>
---
arch/x86/kernel/setup.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3412c4595efd..740f3bdb3f61 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -660,6 +660,17 @@ static void __init trim_platform_memory_ranges(void)
static void __init trim_bios_range(void)
{
+ /*
+ * A special case is the first 4Kb of memory;
+ * This is a BIOS owned area, not kernel ram, but generally
+ * not listed as such in the E820 table.
+ *
+ * This typically reserves additional memory (64KiB by default)
+ * since some BIOSes are known to corrupt low memory. See the
+ * Kconfig help text for X86_RESERVE_LOW.
+ */
+ e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+
/*
* special case: Some BIOSes report the PC BIOS
* area (640Kb -> 1Mb) as RAM even though it is not.
@@ -717,15 +728,6 @@ early_param("reservelow", parse_reservelow);
static void __init trim_low_memory_range(void)
{
- /*
- * A special case is the first 4Kb of memory;
- * This is a BIOS owned area, not kernel ram, but generally
- * not listed as such in the E820 table.
- *
- * This typically reserves additional memory (64KiB by default)
- * since some BIOSes are known to corrupt low memory. See the
- * Kconfig help text for X86_RESERVE_LOW.
- */
memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
}
--
2.29.2
> Linus
--
Sincerely yours,
Mike.
From: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
The BXT/GLK DPLL can't generate certain frequencies. We already
reject the 233-240MHz range on both. But on GLK the DPLL max
frequency was bumped from 300MHz to 594MHz, so now we get to
also worry about the 446-480MHz range (double the original
problem range). Reject any frequency within the higher
problematic range as well.
Cc: stable(a)vger.kernel.org
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/3000
Signed-off-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
---
drivers/gpu/drm/i915/display/intel_hdmi.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/display/intel_hdmi.c b/drivers/gpu/drm/i915/display/intel_hdmi.c
index 66e1ac3887c6..b593a71e6517 100644
--- a/drivers/gpu/drm/i915/display/intel_hdmi.c
+++ b/drivers/gpu/drm/i915/display/intel_hdmi.c
@@ -2218,7 +2218,11 @@ hdmi_port_clock_valid(struct intel_hdmi *hdmi,
has_hdmi_sink))
return MODE_CLOCK_HIGH;
- /* BXT DPLL can't generate 223-240 MHz */
+ /* GLK DPLL can't generate 446-480 MHz */
+ if (IS_GEMINILAKE(dev_priv) && clock > 446666 && clock < 480000)
+ return MODE_CLOCK_RANGE;
+
+ /* BXT/GLK DPLL can't generate 223-240 MHz */
if (IS_GEN9_LP(dev_priv) && clock > 223333 && clock < 240000)
return MODE_CLOCK_RANGE;
--
2.26.2
From: Muchun Song <songmuchun(a)bytedance.com>
Subject: mm: hugetlb: fix missing put_page in gather_surplus_pages()
The VM_BUG_ON_PAGE avoids the generation of any code, even if that
expression has side-effects when !CONFIG_DEBUG_VM.
Link: https://lkml.kernel.org/r/20210126031009.96266-1-songmuchun@bytedance.com
Fixes: e5dfacebe4a4 ("mm/hugetlb.c: just use put_page_testzero() instead of page_count()")
Signed-off-by: Muchun Song <songmuchun(a)bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reviewed-by: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/mm/hugetlb.c~mm-hugetlb-fix-missing-put_page-in-gather_surplus_pages
+++ a/mm/hugetlb.c
@@ -2047,13 +2047,16 @@ retry:
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ int zeroed;
+
if ((--needed) < 0)
break;
/*
* This page is now managed by the hugetlb allocator and has
* no users -- drop the buddy allocator's reference.
*/
- VM_BUG_ON_PAGE(!put_page_testzero(page), page);
+ zeroed = put_page_testzero(page);
+ VM_BUG_ON_PAGE(!zeroed, page);
enqueue_huge_page(h, page);
}
free:
_
From: Roman Gushchin <guro(a)fb.com>
Subject: memblock: do not start bottom-up allocations with kernel_end
With kaslr the kernel image is placed at a random place, so starting the
bottom-up allocation with the kernel_end can result in an allocation
failure and a warning like this one:
[ 0.002920] hugetlb_cma: reserve 2048 MiB, up to 2048 MiB per node
[ 0.002921] ------------[ cut here ]------------
[ 0.002922] memblock: bottom-up allocation failed, memory hotremove may be affected
[ 0.002937] WARNING: CPU: 0 PID: 0 at mm/memblock.c:332 memblock_find_in_range_node+0x178/0x25a
[ 0.002937] Modules linked in:
[ 0.002939] CPU: 0 PID: 0 Comm: swapper Not tainted 5.10.0+ #1169
[ 0.002940] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014
[ 0.002942] RIP: 0010:memblock_find_in_range_node+0x178/0x25a
[ 0.002944] Code: e9 6d ff ff ff 48 85 c0 0f 85 da 00 00 00 80 3d 9b 35 df 00 00 75 15 48 c7 c7 c0 75 59 88 c6 05 8b 35 df 00 01 e8 25 8a fa ff <0f> 0b 48 c7 44 24 20 ff ff ff ff 44 89 e6 44 89 ea 48 c7 c1 70 5c
[ 0.002945] RSP: 0000:ffffffff88803d18 EFLAGS: 00010086 ORIG_RAX: 0000000000000000
[ 0.002947] RAX: 0000000000000000 RBX: 0000000240000000 RCX: 00000000ffffdfff
[ 0.002948] RDX: 00000000ffffdfff RSI: 00000000ffffffea RDI: 0000000000000046
[ 0.002948] RBP: 0000000100000000 R08: ffffffff88922788 R09: 0000000000009ffb
[ 0.002949] R10: 00000000ffffe000 R11: 3fffffffffffffff R12: 0000000000000000
[ 0.002950] R13: 0000000000000000 R14: 0000000080000000 R15: 00000001fb42c000
[ 0.002952] FS: 0000000000000000(0000) GS:ffffffff88f71000(0000) knlGS:0000000000000000
[ 0.002953] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.002954] CR2: ffffa080fb401000 CR3: 00000001fa80a000 CR4: 00000000000406b0
[ 0.002956] Call Trace:
[ 0.002961] ? memblock_alloc_range_nid+0x8d/0x11e
[ 0.002963] ? cma_declare_contiguous_nid+0x2c4/0x38c
[ 0.002964] ? hugetlb_cma_reserve+0xdc/0x128
[ 0.002968] ? flush_tlb_one_kernel+0xc/0x20
[ 0.002969] ? native_set_fixmap+0x82/0xd0
[ 0.002971] ? flat_get_apic_id+0x5/0x10
[ 0.002973] ? register_lapic_address+0x8e/0x97
[ 0.002975] ? setup_arch+0x8a5/0xc3f
[ 0.002978] ? start_kernel+0x66/0x547
[ 0.002980] ? load_ucode_bsp+0x4c/0xcd
[ 0.002982] ? secondary_startup_64_no_verify+0xb0/0xbb
[ 0.002986] random: get_random_bytes called from __warn+0xab/0x110 with crng_init=0
[ 0.002988] ---[ end trace f151227d0b39be70 ]---
At the same time, the kernel image is protected with memblock_reserve(),
so we can just start searching at PAGE_SIZE. In this case the bottom-up
allocation has the same chances to success as a top-down allocation, so
there is no reason to fallback in the case of a failure. All together it
simplifies the logic.
Link: https://lkml.kernel.org/r/20201217201214.3414100-2-guro@fb.com
Fixes: 8fabc623238e ("powerpc: Ensure that swiotlb buffer is allocated from low memory")
Signed-off-by: Roman Gushchin <guro(a)fb.com>
Reviewed-by: Mike Rapoport <rppt(a)linux.ibm.com>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: Wonhyuk Yang <vvghjk1234(a)gmail.com>
Cc: Thiago Jung Bauermann <bauerman(a)linux.ibm.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memblock.c | 49 +++++-------------------------------------------
1 file changed, 6 insertions(+), 43 deletions(-)
--- a/mm/memblock.c~memblock-do-not-start-bottom-up-allocations-with-kernel_end
+++ a/mm/memblock.c
@@ -275,14 +275,6 @@ __memblock_find_range_top_down(phys_addr
*
* Find @size free area aligned to @align in the specified range and node.
*
- * When allocation direction is bottom-up, the @start should be greater
- * than the end of the kernel image. Otherwise, it will be trimmed. The
- * reason is that we want the bottom-up allocation just near the kernel
- * image so it is highly likely that the allocated memory and the kernel
- * will reside in the same node.
- *
- * If bottom-up allocation failed, will try to allocate memory top-down.
- *
* Return:
* Found address on success, 0 on failure.
*/
@@ -291,8 +283,6 @@ static phys_addr_t __init_memblock membl
phys_addr_t end, int nid,
enum memblock_flags flags)
{
- phys_addr_t kernel_end, ret;
-
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
end == MEMBLOCK_ALLOC_KASAN)
@@ -301,40 +291,13 @@ static phys_addr_t __init_memblock membl
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
end = max(start, end);
- kernel_end = __pa_symbol(_end);
-
- /*
- * try bottom-up allocation only when bottom-up mode
- * is set and @end is above the kernel image.
- */
- if (memblock_bottom_up() && end > kernel_end) {
- phys_addr_t bottom_up_start;
-
- /* make sure we will allocate above the kernel */
- bottom_up_start = max(start, kernel_end);
-
- /* ok, try bottom-up allocation first */
- ret = __memblock_find_range_bottom_up(bottom_up_start, end,
- size, align, nid, flags);
- if (ret)
- return ret;
-
- /*
- * we always limit bottom-up allocation above the kernel,
- * but top-down allocation doesn't have the limit, so
- * retrying top-down allocation may succeed when bottom-up
- * allocation failed.
- *
- * bottom-up allocation is expected to be fail very rarely,
- * so we use WARN_ONCE() here to see the stack trace if
- * fail happens.
- */
- WARN_ONCE(IS_ENABLED(CONFIG_MEMORY_HOTREMOVE),
- "memblock: bottom-up allocation failed, memory hotremove may be affected\n");
- }
- return __memblock_find_range_top_down(start, end, size, align, nid,
- flags);
+ if (memblock_bottom_up())
+ return __memblock_find_range_bottom_up(start, end, size, align,
+ nid, flags);
+ else
+ return __memblock_find_range_top_down(start, end, size, align,
+ nid, flags);
}
/**
_
From: Hugh Dickins <hughd(a)google.com>
Subject: mm: thp: fix MADV_REMOVE deadlock on shmem THP
Sergey reported deadlock between kswapd correctly doing its usual
lock_page(page) followed by down_read(page->mapping->i_mmap_rwsem), and
madvise(MADV_REMOVE) on an madvise(MADV_HUGEPAGE) area doing
down_write(page->mapping->i_mmap_rwsem) followed by lock_page(page).
This happened when shmem_fallocate(punch hole)'s unmap_mapping_range()
reaches zap_pmd_range()'s call to __split_huge_pmd(). The same deadlock
could occur when partially truncating a mapped huge tmpfs file, or using
fallocate(FALLOC_FL_PUNCH_HOLE) on it.
__split_huge_pmd()'s page lock was added in 5.8, to make sure that any
concurrent use of reuse_swap_page() (holding page lock) could not catch
the anon THP's mapcounts and swapcounts while they were being split.
Fortunately, reuse_swap_page() is never applied to a shmem or file THP
(not even by khugepaged, which checks PageSwapCache before calling), and
anonymous THPs are never created in shmem or file areas: so that
__split_huge_pmd()'s page lock can only be necessary for anonymous THPs,
on which there is no risk of deadlock with i_mmap_rwsem.
Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2101161409470.2022@eggly.anvils
Fixes: c444eb564fb1 ("mm: thp: make the THP mapcount atomic against __split_huge_pmd_locked()")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Reported-by: Sergey Senozhatsky <sergey.senozhatsky.work(a)gmail.com>
Reviewed-by: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 37 +++++++++++++++++++++++--------------
1 file changed, 23 insertions(+), 14 deletions(-)
--- a/mm/huge_memory.c~mm-thp-fix-madv_remove-deadlock-on-shmem-thp
+++ a/mm/huge_memory.c
@@ -2202,7 +2202,7 @@ void __split_huge_pmd(struct vm_area_str
{
spinlock_t *ptl;
struct mmu_notifier_range range;
- bool was_locked = false;
+ bool do_unlock_page = false;
pmd_t _pmd;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
@@ -2218,7 +2218,6 @@ void __split_huge_pmd(struct vm_area_str
VM_BUG_ON(freeze && !page);
if (page) {
VM_WARN_ON_ONCE(!PageLocked(page));
- was_locked = true;
if (page != pmd_page(*pmd))
goto out;
}
@@ -2227,19 +2226,29 @@ repeat:
if (pmd_trans_huge(*pmd)) {
if (!page) {
page = pmd_page(*pmd);
- if (unlikely(!trylock_page(page))) {
- get_page(page);
- _pmd = *pmd;
- spin_unlock(ptl);
- lock_page(page);
- spin_lock(ptl);
- if (unlikely(!pmd_same(*pmd, _pmd))) {
- unlock_page(page);
+ /*
+ * An anonymous page must be locked, to ensure that a
+ * concurrent reuse_swap_page() sees stable mapcount;
+ * but reuse_swap_page() is not used on shmem or file,
+ * and page lock must not be taken when zap_pmd_range()
+ * calls __split_huge_pmd() while i_mmap_lock is held.
+ */
+ if (PageAnon(page)) {
+ if (unlikely(!trylock_page(page))) {
+ get_page(page);
+ _pmd = *pmd;
+ spin_unlock(ptl);
+ lock_page(page);
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(*pmd, _pmd))) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ goto repeat;
+ }
put_page(page);
- page = NULL;
- goto repeat;
}
- put_page(page);
+ do_unlock_page = true;
}
}
if (PageMlocked(page))
@@ -2249,7 +2258,7 @@ repeat:
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
- if (!was_locked && page)
+ if (do_unlock_page)
unlock_page(page);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
_
From: Rick Edgecombe <rick.p.edgecombe(a)intel.com>
Subject: mm/vmalloc: separate put pages and flush VM flags
When VM_MAP_PUT_PAGES was added, it was defined with the same value as
VM_FLUSH_RESET_PERMS. This doesn't seem like it will cause any big
functional problems other than some excess flushing for VM_MAP_PUT_PAGES
allocations.
Redefine VM_MAP_PUT_PAGES to have its own value. Also, rearrange things
so flags are less likely to be missed in the future.
Link: https://lkml.kernel.org/r/20210122233706.9304-1-rick.p.edgecombe@intel.com
Fixes: b944afc9d64d ("mm: add a VM_MAP_PUT_PAGES flag for vmap")
Signed-off-by: Rick Edgecombe <rick.p.edgecombe(a)intel.com>
Suggested-by: Matthew Wilcox <willy(a)infradead.org>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Daniel Axtens <dja(a)axtens.net>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/vmalloc.h | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
--- a/include/linux/vmalloc.h~mm-vmalloc-separate-put-pages-and-flush-vm-flags
+++ a/include/linux/vmalloc.h
@@ -24,7 +24,8 @@ struct notifier_block; /* in notifier.h
#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
#define VM_NO_GUARD 0x00000040 /* don't add guard page */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
-#define VM_MAP_PUT_PAGES 0x00000100 /* put pages and free array in vfree */
+#define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */
+#define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */
/*
* VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
@@ -37,12 +38,6 @@ struct notifier_block; /* in notifier.h
* determine which allocations need the module shadow freed.
*/
-/*
- * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with
- * vfree_atomic().
- */
-#define VM_FLUSH_RESET_PERMS 0x00000100 /* Reset direct map and flush TLB on unmap */
-
/* bits [20..32] reserved for arch specific ioremap internals */
/*
_
From: Rokudo Yan <wu-yan(a)tcl.com>
Subject: mm, compaction: move high_pfn to the for loop scope
In fast_isolate_freepages, high_pfn will be used if a prefered one(PFN >=
low_fn) not found. But the high_pfn is not reset before searching an free
area, so when it was used as freepage, it may from another free area
searched before. And move_freelist_head(freelist, freepage) will have
unexpected behavior(eg. corrupt the MOVABLE freelist)
Unable to handle kernel paging request at virtual address dead000000000200
Mem abort info:
ESR = 0x96000044
Exception class = DABT (current EL), IL = 32 bits
SET = 0, FnV = 0
EA = 0, S1PTW = 0
Data abort info:
ISV = 0, ISS = 0x00000044
CM = 0, WnR = 1
[dead000000000200] address between user and kernel address ranges
-000|list_cut_before(inline)
-000|move_freelist_head(inline)
-000|fast_isolate_freepages(inline)
-000|isolate_freepages(inline)
-000|compaction_alloc(?, ?)
-001|unmap_and_move(inline)
-001|migrate_pages([NSD:0xFFFFFF80088CBBD0] from = 0xFFFFFF80088CBD88, [NSD:0xFFFFFF80088CBBC8] get_new_p
-002|__read_once_size(inline)
-002|static_key_count(inline)
-002|static_key_false(inline)
-002|trace_mm_compaction_migratepages(inline)
-002|compact_zone(?, [NSD:0xFFFFFF80088CBCB0] capc = 0x0)
-003|kcompactd_do_work(inline)
-003|kcompactd([X19] p = 0xFFFFFF93227FBC40)
-004|kthread([X20] _create = 0xFFFFFFE1AFB26380)
-005|ret_from_fork(asm)
---|end of frame
The issue was reported on an smart phone product with 6GB ram and 3GB zram
as swap device.
This patch fixes the issue by reset high_pfn before searching each free
area, which ensure freepage and freelist match when call
move_freelist_head in fast_isolate_freepages().
Link: http://lkml.kernel.org/r/20190118175136.31341-12-mgorman@techsingularity.net
Link: https://lkml.kernel.org/r/20210112094720.1238444-1-wu-yan@tcl.com
Fixes: 5a811889de10f1eb ("mm, compaction: use free lists to quickly locate a migration target")
Signed-off-by: Rokudo Yan <wu-yan(a)tcl.com>
Acked-by: Mel Gorman <mgorman(a)techsingularity.net>
Acked-by: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/compaction.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/mm/compaction.c~mm-compaction-move-high_pfn-to-the-for-loop-scope
+++ a/mm/compaction.c
@@ -1342,7 +1342,7 @@ fast_isolate_freepages(struct compact_co
{
unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
unsigned int nr_scanned = 0;
- unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0;
+ unsigned long low_pfn, min_pfn, highest = 0;
unsigned long nr_isolated = 0;
unsigned long distance;
struct page *page = NULL;
@@ -1387,6 +1387,7 @@ fast_isolate_freepages(struct compact_co
struct page *freepage;
unsigned long flags;
unsigned int order_scanned = 0;
+ unsigned long high_pfn = 0;
if (!area->nr_free)
continue;
_
From: Muchun Song <songmuchun(a)bytedance.com>
Subject: mm: hugetlb: remove VM_BUG_ON_PAGE from page_huge_active
The page_huge_active() can be called from scan_movable_pages() which do
not hold a reference count to the HugeTLB page. So when we call
page_huge_active() from scan_movable_pages(), the HugeTLB page can be
freed parallel. Then we will trigger a BUG_ON which is in the
page_huge_active() when CONFIG_DEBUG_VM is enabled. Just remove the
VM_BUG_ON_PAGE.
Link: https://lkml.kernel.org/r/20210115124942.46403-6-songmuchun@bytedance.com
Fixes: 7e1f049efb86 ("mm: hugetlb: cleanup using paeg_huge_active()")
Signed-off-by: Muchun Song <songmuchun(a)bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/mm/hugetlb.c~mm-hugetlb-remove-vm_bug_on_page-from-page_huge_active
+++ a/mm/hugetlb.c
@@ -1361,8 +1361,7 @@ struct hstate *size_to_hstate(unsigned l
*/
bool page_huge_active(struct page *page)
{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
- return PageHead(page) && PagePrivate(&page[1]);
+ return PageHeadHuge(page) && PagePrivate(&page[1]);
}
/* never called for tail page */
_
From: Muchun Song <songmuchun(a)bytedance.com>
Subject: mm: hugetlb: fix a race between isolating and freeing page
There is a race between isolate_huge_page() and __free_huge_page().
CPU0: CPU1:
if (PageHuge(page))
put_page(page)
__free_huge_page(page)
spin_lock(&hugetlb_lock)
update_and_free_page(page)
set_compound_page_dtor(page,
NULL_COMPOUND_DTOR)
spin_unlock(&hugetlb_lock)
isolate_huge_page(page)
// trigger BUG_ON
VM_BUG_ON_PAGE(!PageHead(page), page)
spin_lock(&hugetlb_lock)
page_huge_active(page)
// trigger BUG_ON
VM_BUG_ON_PAGE(!PageHuge(page), page)
spin_unlock(&hugetlb_lock)
When we isolate a HugeTLB page on CPU0. Meanwhile, we free it to the
buddy allocator on CPU1. Then, we can trigger a BUG_ON on CPU0. Because
it is already freed to the buddy allocator.
Link: https://lkml.kernel.org/r/20210115124942.46403-5-songmuchun@bytedance.com
Fixes: c8721bbbdd36 ("mm: memory-hotplug: enable memory hotplug to handle hugepage")
Signed-off-by: Muchun Song <songmuchun(a)bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/mm/hugetlb.c~mm-hugetlb-fix-a-race-between-isolating-and-freeing-page
+++ a/mm/hugetlb.c
@@ -5594,9 +5594,9 @@ bool isolate_huge_page(struct page *page
{
bool ret = true;
- VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
- if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+ if (!PageHeadHuge(page) || !page_huge_active(page) ||
+ !get_page_unless_zero(page)) {
ret = false;
goto unlock;
}
_
From: Muchun Song <songmuchun(a)bytedance.com>
Subject: mm: hugetlb: fix a race between freeing and dissolving the page
There is a race condition between __free_huge_page()
and dissolve_free_huge_page().
CPU0: CPU1:
// page_count(page) == 1
put_page(page)
__free_huge_page(page)
dissolve_free_huge_page(page)
spin_lock(&hugetlb_lock)
// PageHuge(page) && !page_count(page)
update_and_free_page(page)
// page is freed to the buddy
spin_unlock(&hugetlb_lock)
spin_lock(&hugetlb_lock)
clear_page_huge_active(page)
enqueue_huge_page(page)
// It is wrong, the page is already freed
spin_unlock(&hugetlb_lock)
The race windows is between put_page() and dissolve_free_huge_page().
We should make sure that the page is already on the free list
when it is dissolved.
As a result __free_huge_page would corrupt page(s) already in the buddy
allocator.
Link: https://lkml.kernel.org/r/20210115124942.46403-4-songmuchun@bytedance.com
Fixes: c8721bbbdd36 ("mm: memory-hotplug: enable memory hotplug to handle hugepage")
Signed-off-by: Muchun Song <songmuchun(a)bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)
--- a/mm/hugetlb.c~mm-hugetlb-fix-a-race-between-freeing-and-dissolving-the-page
+++ a/mm/hugetlb.c
@@ -79,6 +79,21 @@ DEFINE_SPINLOCK(hugetlb_lock);
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
+static inline bool PageHugeFreed(struct page *head)
+{
+ return page_private(head + 4) == -1UL;
+}
+
+static inline void SetPageHugeFreed(struct page *head)
+{
+ set_page_private(head + 4, -1UL);
+}
+
+static inline void ClearPageHugeFreed(struct page *head)
+{
+ set_page_private(head + 4, 0);
+}
+
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -1028,6 +1043,7 @@ static void enqueue_huge_page(struct hst
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
+ SetPageHugeFreed(page);
}
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
@@ -1044,6 +1060,7 @@ static struct page *dequeue_huge_page_no
list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
+ ClearPageHugeFreed(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
return page;
@@ -1505,6 +1522,7 @@ static void prep_new_huge_page(struct hs
spin_lock(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
+ ClearPageHugeFreed(page);
spin_unlock(&hugetlb_lock);
}
@@ -1755,6 +1773,7 @@ int dissolve_free_huge_page(struct page
{
int rc = -EBUSY;
+retry:
/* Not to disrupt normal path by vainly holding hugetlb_lock */
if (!PageHuge(page))
return 0;
@@ -1771,6 +1790,26 @@ int dissolve_free_huge_page(struct page
int nid = page_to_nid(head);
if (h->free_huge_pages - h->resv_huge_pages == 0)
goto out;
+
+ /*
+ * We should make sure that the page is already on the free list
+ * when it is dissolved.
+ */
+ if (unlikely(!PageHugeFreed(head))) {
+ spin_unlock(&hugetlb_lock);
+ cond_resched();
+
+ /*
+ * Theoretically, we should return -EBUSY when we
+ * encounter this race. In fact, we have a chance
+ * to successfully dissolve the page if we do a
+ * retry. Because the race window is quite small.
+ * If we seize this opportunity, it is an optimization
+ * for increasing the success rate of dissolving page.
+ */
+ goto retry;
+ }
+
/*
* Move PageHWPoison flag from head page to the raw error page,
* which makes any subpages rather than the error page reusable.
_
From: Muchun Song <songmuchun(a)bytedance.com>
Subject: mm: hugetlbfs: fix cannot migrate the fallocated HugeTLB page
If a new hugetlb page is allocated during fallocate it will not be marked
as active (set_page_huge_active) which will result in a later
isolate_huge_page failure when the page migration code would like to move
that page. Such a failure would be unexpected and wrong.
Only export set_page_huge_active, just leave clear_page_huge_active as
static. Because there are no external users.
Link: https://lkml.kernel.org/r/20210115124942.46403-3-songmuchun@bytedance.com
Fixes: 70c3547e36f5 (hugetlbfs: add hugetlbfs_fallocate())
Signed-off-by: Muchun Song <songmuchun(a)bytedance.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/hugetlbfs/inode.c | 3 ++-
include/linux/hugetlb.h | 2 ++
mm/hugetlb.c | 2 +-
3 files changed, 5 insertions(+), 2 deletions(-)
--- a/fs/hugetlbfs/inode.c~mm-hugetlbfs-fix-cannot-migrate-the-fallocated-hugetlb-page
+++ a/fs/hugetlbfs/inode.c
@@ -735,9 +735,10 @@ static long hugetlbfs_fallocate(struct f
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ set_page_huge_active(page);
/*
* unlock_page because locked by add_to_page_cache()
- * page_put due to reference from alloc_huge_page()
+ * put_page() due to reference from alloc_huge_page()
*/
unlock_page(page);
put_page(page);
--- a/include/linux/hugetlb.h~mm-hugetlbfs-fix-cannot-migrate-the-fallocated-hugetlb-page
+++ a/include/linux/hugetlb.h
@@ -770,6 +770,8 @@ static inline void huge_ptep_modify_prot
}
#endif
+void set_page_huge_active(struct page *page);
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
--- a/mm/hugetlb.c~mm-hugetlbfs-fix-cannot-migrate-the-fallocated-hugetlb-page
+++ a/mm/hugetlb.c
@@ -1349,7 +1349,7 @@ bool page_huge_active(struct page *page)
}
/* never called for tail page */
-static void set_page_huge_active(struct page *page)
+void set_page_huge_active(struct page *page)
{
VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
SetPagePrivate(&page[1]);
_