The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 021ad274d7dc31611d4f47f7dd4ac7a224526f30 Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui(a)microsoft.com>
Date: Thu, 15 Mar 2018 14:20:53 +0000
Subject: [PATCH] PCI: hv: Serialize the present and eject work items
When we hot-remove the device, we first receive a PCI_EJECT message and
then receive a PCI_BUS_RELATIONS message with bus_rel->device_count == 0.
The first message is offloaded to hv_eject_device_work(), and the second
is offloaded to pci_devices_present_work(). Both the paths can be running
list_del(&hpdev->list_entry), causing general protection fault, because
system_wq can run them concurrently.
The patch eliminates the race condition.
Since access to present/eject work items is serialized, we do not need the
hbus->enum_sem anymore, so remove it.
Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs")
Link: https://lkml.kernel.org/r/KL1P15301MB00064DA6B4D221123B5241CFBFD70@KL1P1530…
Tested-by: Adrian Suhov <v-adsuho(a)microsoft.com>
Tested-by: Chris Valean <v-chvale(a)microsoft.com>
Signed-off-by: Dexuan Cui <decui(a)microsoft.com>
[lorenzo.pieralisi(a)arm.com: squashed semaphore removal patch]
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi(a)arm.com>
Reviewed-by: Michael Kelley <mikelley(a)microsoft.com>
Acked-by: Haiyang Zhang <haiyangz(a)microsoft.com>
Cc: <stable(a)vger.kernel.org> # v4.6+
Cc: Vitaly Kuznetsov <vkuznets(a)redhat.com>
Cc: Jack Morgenstein <jackm(a)mellanox.com>
Cc: Stephen Hemminger <sthemmin(a)microsoft.com>
Cc: K. Y. Srinivasan <kys(a)microsoft.com>
diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c
index 2faf38eab785..b7fd5c157d73 100644
--- a/drivers/pci/host/pci-hyperv.c
+++ b/drivers/pci/host/pci-hyperv.c
@@ -447,7 +447,6 @@ struct hv_pcibus_device {
spinlock_t device_list_lock; /* Protect lists below */
void __iomem *cfg_addr;
- struct semaphore enum_sem;
struct list_head resources_for_children;
struct list_head children;
@@ -461,6 +460,8 @@ struct hv_pcibus_device {
struct retarget_msi_interrupt retarget_msi_interrupt_params;
spinlock_t retarget_msi_interrupt_lock;
+
+ struct workqueue_struct *wq;
};
/*
@@ -1590,12 +1591,8 @@ static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
* It must also treat the omission of a previously observed device as
* notification that the device no longer exists.
*
- * Note that this function is a work item, and it may not be
- * invoked in the order that it was queued. Back to back
- * updates of the list of present devices may involve queuing
- * multiple work items, and this one may run before ones that
- * were sent later. As such, this function only does something
- * if is the last one in the queue.
+ * Note that this function is serialized with hv_eject_device_work(),
+ * because both are pushed to the ordered workqueue hbus->wq.
*/
static void pci_devices_present_work(struct work_struct *work)
{
@@ -1616,11 +1613,6 @@ static void pci_devices_present_work(struct work_struct *work)
INIT_LIST_HEAD(&removed);
- if (down_interruptible(&hbus->enum_sem)) {
- put_hvpcibus(hbus);
- return;
- }
-
/* Pull this off the queue and process it if it was the last one. */
spin_lock_irqsave(&hbus->device_list_lock, flags);
while (!list_empty(&hbus->dr_list)) {
@@ -1637,7 +1629,6 @@ static void pci_devices_present_work(struct work_struct *work)
spin_unlock_irqrestore(&hbus->device_list_lock, flags);
if (!dr) {
- up(&hbus->enum_sem);
put_hvpcibus(hbus);
return;
}
@@ -1724,7 +1715,6 @@ static void pci_devices_present_work(struct work_struct *work)
break;
}
- up(&hbus->enum_sem);
put_hvpcibus(hbus);
kfree(dr);
}
@@ -1770,7 +1760,7 @@ static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
spin_unlock_irqrestore(&hbus->device_list_lock, flags);
get_hvpcibus(hbus);
- schedule_work(&dr_wrk->wrk);
+ queue_work(hbus->wq, &dr_wrk->wrk);
}
/**
@@ -1848,7 +1838,7 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
get_pcichild(hpdev, hv_pcidev_ref_pnp);
INIT_WORK(&hpdev->wrk, hv_eject_device_work);
get_hvpcibus(hpdev->hbus);
- schedule_work(&hpdev->wrk);
+ queue_work(hpdev->hbus->wq, &hpdev->wrk);
}
/**
@@ -2461,13 +2451,18 @@ static int hv_pci_probe(struct hv_device *hdev,
spin_lock_init(&hbus->config_lock);
spin_lock_init(&hbus->device_list_lock);
spin_lock_init(&hbus->retarget_msi_interrupt_lock);
- sema_init(&hbus->enum_sem, 1);
init_completion(&hbus->remove_event);
+ hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0,
+ hbus->sysdata.domain);
+ if (!hbus->wq) {
+ ret = -ENOMEM;
+ goto free_bus;
+ }
ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
hv_pci_onchannelcallback, hbus);
if (ret)
- goto free_bus;
+ goto destroy_wq;
hv_set_drvdata(hdev, hbus);
@@ -2536,6 +2531,8 @@ static int hv_pci_probe(struct hv_device *hdev,
hv_free_config_window(hbus);
close:
vmbus_close(hdev->channel);
+destroy_wq:
+ destroy_workqueue(hbus->wq);
free_bus:
free_page((unsigned long)hbus);
return ret;
@@ -2615,6 +2612,7 @@ static int hv_pci_remove(struct hv_device *hdev)
irq_domain_free_fwnode(hbus->sysdata.fwnode);
put_hvpcibus(hbus);
wait_for_completion(&hbus->remove_event);
+ destroy_workqueue(hbus->wq);
free_page((unsigned long)hbus);
return 0;
}
The current rescind processing code will not correctly handle
the case where the host immediately rescinds a channel that has
been offerred. In this case, we could be blocked in the open call and
since the channel is rescinded, the host will not respond and we could
be blocked forever in the vmbus open call.i Fix this problem.
Signed-off-by: K. Y. Srinivasan <kys(a)microsoft.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
(cherry picked from commit 7fa32e5ec28b1609abc0b797b58267f725fc3964)
Signed-off-by: Mohammed Gamal <mgamal(a)redhat.com>
---
drivers/hv/channel.c | 10 ++++++++--
drivers/hv/channel_mgmt.c | 7 ++++---
include/linux/hyperv.h | 1 +
3 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 1c967f5..1d58986 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -659,22 +659,28 @@ void vmbus_close(struct vmbus_channel *channel)
*/
return;
}
- mutex_lock(&vmbus_connection.channel_mutex);
/*
* Close all the sub-channels first and then close the
* primary channel.
*/
list_for_each_safe(cur, tmp, &channel->sc_list) {
cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
- vmbus_close_internal(cur_channel);
if (cur_channel->rescind) {
+ wait_for_completion(&cur_channel->rescind_event);
+ mutex_lock(&vmbus_connection.channel_mutex);
+ vmbus_close_internal(cur_channel);
hv_process_channel_removal(
cur_channel->offermsg.child_relid);
+ } else {
+ mutex_lock(&vmbus_connection.channel_mutex);
+ vmbus_close_internal(cur_channel);
}
+ mutex_unlock(&vmbus_connection.channel_mutex);
}
/*
* Now close the primary.
*/
+ mutex_lock(&vmbus_connection.channel_mutex);
vmbus_close_internal(channel);
mutex_unlock(&vmbus_connection.channel_mutex);
}
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index ec5454f..c21020b 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -333,6 +333,7 @@ static struct vmbus_channel *alloc_channel(void)
return NULL;
spin_lock_init(&channel->lock);
+ init_completion(&channel->rescind_event);
INIT_LIST_HEAD(&channel->sc_list);
INIT_LIST_HEAD(&channel->percpu_list);
@@ -898,6 +899,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
/*
* Now wait for offer handling to complete.
*/
+ vmbus_rescind_cleanup(channel);
while (READ_ONCE(channel->probe_done) == false) {
/*
* We wait here until any channel offer is currently
@@ -913,7 +915,6 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
if (channel->device_obj) {
if (channel->chn_rescind_callback) {
channel->chn_rescind_callback(channel);
- vmbus_rescind_cleanup(channel);
return;
}
/*
@@ -922,7 +923,6 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
*/
dev = get_device(&channel->device_obj->device);
if (dev) {
- vmbus_rescind_cleanup(channel);
vmbus_device_unregister(channel->device_obj);
put_device(dev);
}
@@ -936,13 +936,14 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
* 2. Then close the primary channel.
*/
mutex_lock(&vmbus_connection.channel_mutex);
- vmbus_rescind_cleanup(channel);
if (channel->state == CHANNEL_OPEN_STATE) {
/*
* The channel is currently not open;
* it is safe for us to cleanup the channel.
*/
hv_process_channel_removal(rescind->child_relid);
+ } else {
+ complete(&channel->rescind_event);
}
mutex_unlock(&vmbus_connection.channel_mutex);
}
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 1552a5f..465d5db 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -724,6 +724,7 @@ struct vmbus_channel {
u8 monitor_bit;
bool rescind; /* got rescind msg */
+ struct completion rescind_event;
u32 ringbuffer_gpadlhandle;
--
1.8.3.1
This is the start of the stable review cycle for the 3.18.106 release.
There are 52 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Tue Apr 24 13:53:02 UTC 2018.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v3.x/stable-review/patch-3.18.106-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-3.18.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 3.18.106-rc1
Amir Goldstein <amir73il(a)gmail.com>
fanotify: fix logic of events on child
Ian Kent <raven(a)themaw.net>
autofs: mount point create should honour passed in mode
Al Viro <viro(a)zeniv.linux.org.uk>
Don't leak MNT_INTERNAL away from internal mounts
Al Viro <viro(a)zeniv.linux.org.uk>
rpc_pipefs: fix double-dput()
Al Viro <viro(a)zeniv.linux.org.uk>
hypfs_kill_super(): deal with failed allocations
Al Viro <viro(a)zeniv.linux.org.uk>
jffs2_kill_sb(): deal with failed allocations
Michael Ellerman <mpe(a)ellerman.id.au>
powerpc/lib: Fix off-by-one in alternate feature patching
Matt Redfearn <matt.redfearn(a)mips.com>
MIPS: memset.S: Fix clobber of v1 in last_fixup
Matt Redfearn <matt.redfearn(a)mips.com>
MIPS: memset.S: Fix return of __clear_user from Lpartial_fixup
Matt Redfearn <matt.redfearn(a)mips.com>
MIPS: memset.S: EVA & fault support for small_memset
Rodrigo Rivas Costa <rodrigorivascosta(a)gmail.com>
HID: hidraw: Fix crash on HIDIOCGFEATURE with a destroyed device
Takashi Iwai <tiwai(a)suse.de>
ALSA: rawmidi: Fix missing input substream checks in compat ioctls
Paul Parsons <lost.distance(a)yahoo.com>
drm/radeon: Fix PCIe lane width calculation
Theodore Ts'o <tytso(a)mit.edu>
ext4: don't allow r/w mounts if metadata blocks overlap the superblock
Theodore Ts'o <tytso(a)mit.edu>
ext4: fail ext4_iget for root directory if unallocated
Theodore Ts'o <tytso(a)mit.edu>
ext4: add validity checks for bitmap block numbers
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix endless loop for XRUN recovery in OSS emulation
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix mutex unbalance in OSS emulation ioctls
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Return -EBUSY for OSS ioctls changing busy streams
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Avoid potential races between OSS ioctls and read/write
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Use ERESTARTSYS instead of EINTR in OSS emulation
Nicholas Mc Guire <hofrat(a)osadl.org>
ALSA: oss: consolidate kmalloc/memset 0 call to kzalloc
Igor Pylypiv <igor.pylypiv(a)gmail.com>
watchdog: f71808e_wdt: Fix WD_EN register read
Mikhail Lappo <mikhail.lappo(a)esrlabs.com>
thermal: imx: Fix race condition in imx_thermal_probe()
Richard Genoud <richard.genoud(a)gmail.com>
clk: mvebu: armada-38x: add support for missing clocks
Ralph Sennhauser <ralph.sennhauser(a)gmail.com>
clk: mvebu: armada-38x: add support for 1866MHz variants
Alex Smith <alex.smith(a)imgtec.com>
mmc: jz4740: Fix race condition in IRQ mask update
Theodore Ts'o <tytso(a)mit.edu>
jbd2: if the journal is aborted then don't allow update of the log tail
Theodore Ts'o <tytso(a)mit.edu>
random: use a tighter cap in credit_entropy_bits_safe()
Mika Westerberg <mika.westerberg(a)linux.intel.com>
thunderbolt: Resume control channel after hibernation image is created
James Kelly <jamespeterkelly(a)gmail.com>
ASoC: ssm2602: Replace reg_default_raw with reg_default
Nicholas Piggin <npiggin(a)gmail.com>
powerpc/powernv: Fix OPAL NVRAM driver OPAL_BUSY loops
Nicholas Piggin <npiggin(a)gmail.com>
powerpc/64: Fix smp_wmb barrier definition use use lwsync consistently
Nicholas Piggin <npiggin(a)gmail.com>
powerpc/powernv: Handle unknown OPAL errors in opal_nvram_write()
Aaron Ma <aaron.ma(a)canonical.com>
HID: i2c-hid: fix size check and type usage
Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
usb: dwc3: pci: Properly cleanup resource
Zhengjun Xing <zhengjun.xing(a)linux.intel.com>
USB:fix USB3 devices behind USB3 hubs not resuming at hibernate thaw
Mika Westerberg <mika.westerberg(a)linux.intel.com>
ACPI / hotplug / PCI: Check presence of slot itself in get_slot_status()
Jason Andryuk <jandryuk(a)gmail.com>
xen-netfront: Fix hang on device removal
Nicolas Ferre <nicolas.ferre(a)microchip.com>
ARM: dts: at91: at91sam9g25: fix mux-mask pinctrl property
Heinrich Schuchardt <xypron.glpk(a)gmx.de>
usb: musb: gadget: misplaced out of bounds check
Takashi Iwai <tiwai(a)suse.de>
resource: fix integer overflow at reallocation
Andrew Morton <akpm(a)linux-foundation.org>
fs/reiserfs/journal.c: add missing resierfs_warning() arg
Richard Weinberger <richard(a)nod.at>
ubi: Reject MLC NAND
Romain Izard <romain.izard.pro(a)gmail.com>
ubi: Fix error for write access
Richard Weinberger <richard(a)nod.at>
ubifs: Check ubifs_wbuf_sync() return code
Tejaswi Tanikella <tejaswit(a)codeaurora.org>
slip: Check if rstate is initialized before uncompressing
Vasily Gorbik <gor(a)linux.ibm.com>
s390/ipl: ensure loadparm valid flag is set
Julian Wiedmann <jwi(a)linux.vnet.ibm.com>
s390/qdio: don't merge ERROR output buffers
Julian Wiedmann <jwi(a)linux.vnet.ibm.com>
s390/qdio: don't retry EQBS after CCQ 96
Helge Deller <deller(a)gmx.de>
parisc: Fix out of array access in match_pci_device()
Mauro Carvalho Chehab <mchehab(a)s-opensource.com>
media: v4l2-compat-ioctl32: don't oops on overlay
-------------
Diffstat:
Makefile | 4 +-
arch/arm/boot/dts/at91sam9g25.dtsi | 2 +-
arch/mips/lib/memset.S | 11 +-
arch/parisc/kernel/drivers.c | 4 +
arch/powerpc/include/asm/barrier.h | 3 +-
arch/powerpc/include/asm/synch.h | 4 -
arch/powerpc/lib/feature-fixups.c | 2 +-
arch/powerpc/platforms/powernv/opal-nvram.c | 11 +-
arch/s390/hypfs/inode.c | 2 +-
arch/s390/kernel/ipl.c | 1 +
drivers/char/random.c | 2 +-
drivers/clk/mvebu/armada-38x.c | 15 +-
drivers/gpu/drm/radeon/si_dpm.c | 4 +-
drivers/hid/hidraw.c | 5 +
drivers/hid/i2c-hid/i2c-hid.c | 13 +-
drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 4 +-
drivers/mmc/host/jz4740_mmc.c | 2 +-
drivers/mtd/ubi/block.c | 2 +-
drivers/mtd/ubi/build.c | 11 ++
drivers/net/slip/slhc.c | 5 +
drivers/net/xen-netfront.c | 7 +-
drivers/pci/hotplug/acpiphp_glue.c | 23 +++-
drivers/s390/cio/qdio_main.c | 42 +++---
drivers/thermal/imx_thermal.c | 6 +-
drivers/thunderbolt/nhi.c | 1 +
drivers/usb/core/generic.c | 9 +-
drivers/usb/dwc3/dwc3-pci.c | 2 +-
drivers/usb/musb/musb_gadget_ep0.c | 14 +-
drivers/watchdog/f71808e_wdt.c | 2 +-
fs/autofs4/root.c | 2 +-
fs/ext4/balloc.c | 16 ++-
fs/ext4/ialloc.c | 8 +-
fs/ext4/inode.c | 6 +
fs/ext4/super.c | 6 +
fs/jbd2/journal.c | 5 +-
fs/jffs2/super.c | 2 +-
fs/namespace.c | 3 +-
fs/notify/fanotify/fanotify.c | 34 ++---
fs/reiserfs/journal.c | 2 +-
fs/ubifs/super.c | 14 +-
include/net/slhc_vj.h | 1 +
include/sound/pcm_oss.h | 1 +
kernel/resource.c | 3 +-
net/sunrpc/rpc_pipe.c | 1 +
sound/core/oss/pcm_oss.c | 189 ++++++++++++++++++++------
sound/core/rawmidi_compat.c | 18 ++-
sound/soc/codecs/ssm2602.c | 19 ++-
47 files changed, 388 insertions(+), 155 deletions(-)
From: Long Li <longli(a)microsoft.com>
When sending the last iov that breaks into smaller buffers to fit the
transfer size, it's necessary to check if this is the last iov.
If this is the latest iov, stop and proceed to send pages.
Signed-off-by: Long Li <longli(a)microsoft.com>
Cc: stable(a)vger.kernel.org
---
fs/cifs/smbdirect.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 90e673c..b5c6c0d 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -2197,6 +2197,8 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
goto done;
}
i++;
+ if (i == rqst->rq_nvec)
+ break;
}
start = i;
buflen = 0;
--
2.7.4
From: Thor Thayer <thor.thayer(a)linux.intel.com>
The current Cadence QSPI driver caused a kernel panic when loading
a Root Filesystem from QSPI. The problem was caused by reading more
bytes than needed because the QSPI operated on 4 bytes at a time.
<snip>
[ 7.947754] spi_nor_read[1048]:from 0x037cad74, len 1 [bfe07fff]
[ 7.956247] cqspi_read[910]:offset 0x58502516, buffer=bfe07fff
[ 7.956247]
[ 7.966046] Unable to handle kernel paging request at virtual
address bfe08002
[ 7.973239] pgd = eebfc000
[ 7.975931] [bfe08002] *pgd=2fffb811, *pte=00000000, *ppte=00000000
</snip>
Notice above how only 1 byte needed to be read but by reading 4 bytes
into the end of a mapped page, an unrecoverable page fault occurred.
This patch uses a temporary buffer to hold the 4 bytes read and then
copies only the bytes required into the buffer. A min() function is
used to limit the length to prevent buffer overflows.
Request testing of this patch on other platforms. This was tested
on the Intel Arria10 SoCFPGA DevKit.
Fixes: 0cf1725676a97fc8 ("mtd: spi-nor: cqspi: Fix build on arches missing readsl/writesl")
Signed-off-by: Thor Thayer <thor.thayer(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org>
Reviewed-by: Marek Vasut <marek.vasut(a)gmail.com>
---
v2 Changes to only write dangling bytes at end of transfer since
previous patch may have multiple dangling byte transfers.
Remove write patch since no errors reported and write timeout
needs more investigation.
v3 Add Fixes tag Cc-stable tag.
---
drivers/mtd/spi-nor/cadence-quadspi.c | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c
index 2f3a4d4232b3..c3f7aaa5d18f 100644
--- a/drivers/mtd/spi-nor/cadence-quadspi.c
+++ b/drivers/mtd/spi-nor/cadence-quadspi.c
@@ -507,7 +507,9 @@ static int cqspi_indirect_read_execute(struct spi_nor *nor, u8 *rxbuf,
void __iomem *reg_base = cqspi->iobase;
void __iomem *ahb_base = cqspi->ahb_base;
unsigned int remaining = n_rx;
+ unsigned int mod_bytes = n_rx % 4;
unsigned int bytes_to_read = 0;
+ u8 *rxbuf_end = rxbuf + n_rx;
int ret = 0;
writel(from_addr, reg_base + CQSPI_REG_INDIRECTRDSTARTADDR);
@@ -536,11 +538,24 @@ static int cqspi_indirect_read_execute(struct spi_nor *nor, u8 *rxbuf,
}
while (bytes_to_read != 0) {
+ unsigned int word_remain = round_down(remaining, 4);
+
bytes_to_read *= cqspi->fifo_width;
bytes_to_read = bytes_to_read > remaining ?
remaining : bytes_to_read;
- ioread32_rep(ahb_base, rxbuf,
- DIV_ROUND_UP(bytes_to_read, 4));
+ bytes_to_read = round_down(bytes_to_read, 4);
+ /* Read 4 byte word chunks then single bytes */
+ if (bytes_to_read) {
+ ioread32_rep(ahb_base, rxbuf,
+ (bytes_to_read / 4));
+ } else if (!word_remain && mod_bytes) {
+ unsigned int temp = ioread32(ahb_base);
+
+ bytes_to_read = mod_bytes;
+ memcpy(rxbuf, &temp, min((unsigned int)
+ (rxbuf_end - rxbuf),
+ bytes_to_read));
+ }
rxbuf += bytes_to_read;
remaining -= bytes_to_read;
bytes_to_read = cqspi_get_rd_sram_level(cqspi);
--
2.7.4
From: Greg Thelen <gthelen(a)google.com>
commit 2e898e4c0a3897ccd434adac5abb8330194f527b upstream.
lock_page_memcg()/unlock_page_memcg() use spin_lock_irqsave/restore() if
the page's memcg is undergoing move accounting, which occurs when a
process leaves its memcg for a new one that has
memory.move_charge_at_immigrate set.
unlocked_inode_to_wb_begin,end() use spin_lock_irq/spin_unlock_irq() if
the given inode is switching writeback domains. Switches occur when
enough writes are issued from a new domain.
This existing pattern is thus suspicious:
lock_page_memcg(page);
unlocked_inode_to_wb_begin(inode, &locked);
...
unlocked_inode_to_wb_end(inode, locked);
unlock_page_memcg(page);
If both inode switch and process memcg migration are both in-flight then
unlocked_inode_to_wb_end() will unconditionally enable interrupts while
still holding the lock_page_memcg() irq spinlock. This suggests the
possibility of deadlock if an interrupt occurs before unlock_page_memcg().
truncate
__cancel_dirty_page
lock_page_memcg
unlocked_inode_to_wb_begin
unlocked_inode_to_wb_end
<interrupts mistakenly enabled>
<interrupt>
end_page_writeback
test_clear_page_writeback
lock_page_memcg
<deadlock>
unlock_page_memcg
Due to configuration limitations this deadlock is not currently possible
because we don't mix cgroup writeback (a cgroupv2 feature) and
memory.move_charge_at_immigrate (a cgroupv1 feature).
If the kernel is hacked to always claim inode switching and memcg
moving_account, then this script triggers lockup in less than a minute:
cd /mnt/cgroup/memory
mkdir a b
echo 1 > a/memory.move_charge_at_immigrate
echo 1 > b/memory.move_charge_at_immigrate
(
echo $BASHPID > a/cgroup.procs
while true; do
dd if=/dev/zero of=/mnt/big bs=1M count=256
done
) &
while true; do
sync
done &
sleep 1h &
SLEEP=$!
while true; do
echo $SLEEP > a/cgroup.procs
echo $SLEEP > b/cgroup.procs
done
The deadlock does not seem possible, so it's debatable if there's any
reason to modify the kernel. I suggest we should to prevent future
surprises. And Wang Long said "this deadlock occurs three times in our
environment", so there's more reason to apply this, even to stable.
Stable 4.4 has minor conflicts applying this patch. For a clean 4.4 patch
see "[PATCH for-4.4] writeback: safer lock nesting"
https://lkml.org/lkml/2018/4/11/146
Wang Long said "this deadlock occurs three times in our environment"
[gthelen(a)google.com: v4]
Link: http://lkml.kernel.org/r/20180411084653.254724-1-gthelen@google.com
[akpm(a)linux-foundation.org: comment tweaks, struct initialization simplification]
Change-Id: Ibb773e8045852978f6207074491d262f1b3fb613
Link: http://lkml.kernel.org/r/20180410005908.167976-1-gthelen@google.com
Fixes: 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates")
Signed-off-by: Greg Thelen <gthelen(a)google.com>
Reported-by: Wang Long <wanglong19(a)meituan.com>
Acked-by: Wang Long <wanglong19(a)meituan.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Tejun Heo <tj(a)kernel.org>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: <stable(a)vger.kernel.org> [v4.2+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
[natechancellor: Adjust context due to lack of b93b016313b3b]
Signed-off-by: Nathan Chancellor <natechancellor(a)gmail.com>
---
fs/fs-writeback.c | 7 ++++---
include/linux/backing-dev-defs.h | 5 +++++
include/linux/backing-dev.h | 30 ++++++++++++++++--------------
mm/page-writeback.c | 18 +++++++++---------
4 files changed, 34 insertions(+), 26 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d4d04fee568a..40c34a0ef58a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -745,11 +745,12 @@ int inode_congested(struct inode *inode, int cong_bits)
*/
if (inode && inode_to_wb_is_valid(inode)) {
struct bdi_writeback *wb;
- bool locked, congested;
+ struct wb_lock_cookie lock_cookie = {};
+ bool congested;
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
congested = wb_congested(wb, cong_bits);
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &lock_cookie);
return congested;
}
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index bfe86b54f6c1..0bd432a4d7bd 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -223,6 +223,11 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
set_wb_congested(bdi->wb.congested, sync);
}
+struct wb_lock_cookie {
+ bool locked;
+ unsigned long flags;
+};
+
#ifdef CONFIG_CGROUP_WRITEBACK
/**
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 3e4ce54d84ab..82e8b73117d1 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -346,7 +346,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
/**
* unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
* @inode: target inode
- * @lockedp: temp bool output param, to be passed to the end function
+ * @cookie: output param, to be passed to the end function
*
* The caller wants to access the wb associated with @inode but isn't
* holding inode->i_lock, mapping->tree_lock or wb->list_lock. This
@@ -354,12 +354,12 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
* association doesn't change until the transaction is finished with
* unlocked_inode_to_wb_end().
*
- * The caller must call unlocked_inode_to_wb_end() with *@lockdep
- * afterwards and can't sleep during transaction. IRQ may or may not be
- * disabled on return.
+ * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
+ * can't sleep during the transaction. IRQs may or may not be disabled on
+ * return.
*/
static inline struct bdi_writeback *
-unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
rcu_read_lock();
@@ -367,10 +367,10 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
* Paired with store_release in inode_switch_wb_work_fn() and
* ensures that we see the new wb if we see cleared I_WB_SWITCH.
*/
- *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+ cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
- if (unlikely(*lockedp))
- spin_lock_irq(&inode->i_mapping->tree_lock);
+ if (unlikely(cookie->locked))
+ spin_lock_irqsave(&inode->i_mapping->tree_lock, cookie->flags);
/*
* Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
@@ -382,12 +382,13 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
/**
* unlocked_inode_to_wb_end - end inode wb access transaction
* @inode: target inode
- * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+ * @cookie: @cookie from unlocked_inode_to_wb_begin()
*/
-static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+static inline void unlocked_inode_to_wb_end(struct inode *inode,
+ struct wb_lock_cookie *cookie)
{
- if (unlikely(locked))
- spin_unlock_irq(&inode->i_mapping->tree_lock);
+ if (unlikely(cookie->locked))
+ spin_unlock_irqrestore(&inode->i_mapping->tree_lock, cookie->flags);
rcu_read_unlock();
}
@@ -434,12 +435,13 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
}
static inline struct bdi_writeback *
-unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
return inode_to_wb(inode);
}
-static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+static inline void unlocked_inode_to_wb_end(struct inode *inode,
+ struct wb_lock_cookie *cookie)
{
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 586f31261c83..8369572e1f7d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2501,13 +2501,13 @@ void account_page_redirty(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- bool locked;
+ struct wb_lock_cookie cookie = {};
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
current->nr_dirtied--;
dec_node_page_state(page, NR_DIRTIED);
dec_wb_stat(wb, WB_DIRTIED);
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
}
}
EXPORT_SYMBOL(account_page_redirty);
@@ -2613,15 +2613,15 @@ void __cancel_dirty_page(struct page *page)
if (mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- bool locked;
+ struct wb_lock_cookie cookie = {};
lock_page_memcg(page);
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page))
account_page_cleaned(page, mapping, wb);
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
unlock_page_memcg(page);
} else {
ClearPageDirty(page);
@@ -2653,7 +2653,7 @@ int clear_page_dirty_for_io(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- bool locked;
+ struct wb_lock_cookie cookie = {};
/*
* Yes, Virginia, this is indeed insane.
@@ -2690,14 +2690,14 @@ int clear_page_dirty_for_io(struct page *page)
* always locked coming in here, so we get the desired
* exclusion.
*/
- wb = unlocked_inode_to_wb_begin(inode, &locked);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page)) {
dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
- unlocked_inode_to_wb_end(inode, locked);
+ unlocked_inode_to_wb_end(inode, &cookie);
return ret;
}
return TestClearPageDirty(page);
--
2.17.0
Hi Greg,
Commit 7dac4a1726a9 ("ext4: add validity checks for bitmap block
numbers") seems to be the cause of the regression reported here:
https://marc.info/?l=linux-ext4&m=152416385122029&w=2
ext4 folks are probably busy at LSF, so no reply yet. Should this
commit be held until we get word from Ted?
Please excuse broken threading.
Thanks,
Ilya
commit 5e1df40f40ee45a97bb1066c3d71f0ae920a9672 upstream.
Currently we see sporadic timeouts during CDCLK changing both on BXT and
GLK as reported by the Bugzilla: ticket. It's easy to reproduce this by
changing the frequency in a tight loop after blanking the display. The
upper bound for the completion time is 800us based on my tests, so
increase it from the current 500us to 2ms; with that I couldn't trigger
the problem either on BXT or GLK.
Note that timeouts happened during both the change notification and the
voltage level setting PCODE request. (For the latter one BSpec doesn't
require us to wait for completion before further HW programming.)
This issue is similar to
commit 2c7d0602c815 ("drm/i915/gen9: Fix PCODE polling during CDCLK
change notification")
but there the PCODE request does complete (as shown by the mbox
busy flag), only the reply we get from PCODE indicates a failure.
So there we keep resending the request until a success reply, here we
just have to increase the timeout for the one PCODE request we send.
v2:
- s/snb_pcode_request/sandybridge_pcode_write_timeout/ (Ville)
Cc: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # v4.15
Acked-by: Chris Wilson <chris(a)chris-wilson.co.uk> (v1)
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103326
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Signed-off-by: Imre Deak <imre.deak(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180130142939.17983-1-imre.d…
(cherry picked from commit e76019a81921e87a4d9e7b3d86102bc708a6c227)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi(a)intel.com>
(Rebased for v4.15 stable tree due to upstream s/DIV_ROUND_UP(cdclk, 25000)/cdclk_state->voltage_level/ change )
Signed-off-by: Imre Deak <imre.deak(a)intel.com>
---
drivers/gpu/drm/i915/i915_drv.h | 6 +++++-
drivers/gpu/drm/i915/intel_cdclk.c | 22 +++++++++++++++++-----
drivers/gpu/drm/i915/intel_pm.c | 6 +++---
3 files changed, 25 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e143004e66d5..ffc75271bf7e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -4140,7 +4140,11 @@ extern void intel_display_print_error_state(struct drm_i915_error_state_buf *e,
struct intel_display_error_state *error);
int sandybridge_pcode_read(struct drm_i915_private *dev_priv, u32 mbox, u32 *val);
-int sandybridge_pcode_write(struct drm_i915_private *dev_priv, u32 mbox, u32 val);
+int sandybridge_pcode_write_timeout(struct drm_i915_private *dev_priv, u32 mbox,
+ u32 val, int timeout_us);
+#define sandybridge_pcode_write(dev_priv, mbox, val) \
+ sandybridge_pcode_write_timeout(dev_priv, mbox, val, 500)
+
int skl_pcode_request(struct drm_i915_private *dev_priv, u32 mbox, u32 request,
u32 reply_mask, u32 reply, int timeout_base_ms);
diff --git a/drivers/gpu/drm/i915/intel_cdclk.c b/drivers/gpu/drm/i915/intel_cdclk.c
index 60cf4e58389a..658f681eb927 100644
--- a/drivers/gpu/drm/i915/intel_cdclk.c
+++ b/drivers/gpu/drm/i915/intel_cdclk.c
@@ -1284,10 +1284,15 @@ static void bxt_set_cdclk(struct drm_i915_private *dev_priv,
break;
}
- /* Inform power controller of upcoming frequency change */
+ /*
+ * Inform power controller of upcoming frequency change. BSpec
+ * requires us to wait up to 150usec, but that leads to timeouts;
+ * the 2ms used here is based on experiment.
+ */
mutex_lock(&dev_priv->pcu_lock);
- ret = sandybridge_pcode_write(dev_priv, HSW_PCODE_DE_WRITE_FREQ_REQ,
- 0x80000000);
+ ret = sandybridge_pcode_write_timeout(dev_priv,
+ HSW_PCODE_DE_WRITE_FREQ_REQ,
+ 0x80000000, 2000);
mutex_unlock(&dev_priv->pcu_lock);
if (ret) {
@@ -1318,8 +1323,15 @@ static void bxt_set_cdclk(struct drm_i915_private *dev_priv,
I915_WRITE(CDCLK_CTL, val);
mutex_lock(&dev_priv->pcu_lock);
- ret = sandybridge_pcode_write(dev_priv, HSW_PCODE_DE_WRITE_FREQ_REQ,
- DIV_ROUND_UP(cdclk, 25000));
+ /*
+ * The timeout isn't specified, the 2ms used here is based on
+ * experiment.
+ * FIXME: Waiting for the request completion could be delayed until
+ * the next PCODE request based on BSpec.
+ */
+ ret = sandybridge_pcode_write_timeout(dev_priv,
+ HSW_PCODE_DE_WRITE_FREQ_REQ,
+ DIV_ROUND_UP(cdclk, 25000), 2000);
mutex_unlock(&dev_priv->pcu_lock);
if (ret) {
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index f0d0dbab4150..c4e5db551fc2 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -9227,8 +9227,8 @@ int sandybridge_pcode_read(struct drm_i915_private *dev_priv, u32 mbox, u32 *val
return 0;
}
-int sandybridge_pcode_write(struct drm_i915_private *dev_priv,
- u32 mbox, u32 val)
+int sandybridge_pcode_write_timeout(struct drm_i915_private *dev_priv,
+ u32 mbox, u32 val, int timeout_us)
{
int status;
@@ -9251,7 +9251,7 @@ int sandybridge_pcode_write(struct drm_i915_private *dev_priv,
if (__intel_wait_for_register_fw(dev_priv,
GEN6_PCODE_MAILBOX, GEN6_PCODE_READY, 0,
- 500, 0, NULL)) {
+ timeout_us, 0, NULL)) {
DRM_ERROR("timeout waiting for pcode write of 0x%08x to mbox %x to finish for %ps\n",
val, mbox, __builtin_return_address(0));
return -ETIMEDOUT;
--
2.13.2