Since SCSI scanning occurs asynchronously, since sd_revalidate_disk()
is called from sd_probe_async(), and since sd_revalidate_disk() calls
sd_zbc_read_zones(), sd_zbc_read_zones() can be called concurrently
with blkdev_report_zones() and/or blkdev_reset_zones(). That can cause
these functions to fail with -EIO because sd_zbc_read_zones() e.g.
sets q->nr_zones to zero before restoring it to the actual value, even
if no drive characteristics have changed. Avoid this by making the
following changes:
- Protect the code that updates zone information with blk_queue_enter()
and blk_queue_exit(); a read-side sketch follows the note below.
- Modify sd_zbc_setup_seq_zones_bitmap() and sd_zbc_setup() such that
these functions do not modify struct scsi_disk before all zone
information has been obtained.
Note: since commit 055f6e18e08f ("block: Make q_usage_counter also
track legacy requests"; kernel v4.15) the request queue freezing
mechanism also affects legacy request queues.
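A minimal sketch of how a reader of the zone information is protected
against a concurrent update (illustrative only, not part of the patch;
the exact blk_queue_enter() signature differs between kernel versions):

static int example_read_nr_zones(struct request_queue *q, u32 *nr_zones)
{
        /* Take a queue usage reference; this fails if the queue is dying. */
        if (blk_queue_enter(q, 0))
                return -ENODEV;
        /*
         * An updater must freeze the queue first, so q->nr_zones cannot
         * change underneath us while we hold the reference.
         */
        *nr_zones = q->nr_zones;
        blk_queue_exit(q);
        return 0;
}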
Signed-off-by: Bart Van Assche <bart.vanassche(a)wdc.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Damien Le Moal <damien.lemoal(a)wdc.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Hannes Reinecke <hare(a)suse.com>
Cc: stable(a)vger.kernel.org
---
drivers/scsi/sd_zbc.c | 140 +++++++++++++++++++++++++++++--------------------
include/linux/blkdev.h | 5 ++
2 files changed, 87 insertions(+), 58 deletions(-)
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index f290e8a9923d..88e1e80fa497 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -378,8 +378,10 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, unsigned char *buf)
*
* Check that all zones of the device are equal. The last zone can however
* be smaller. The zone size must also be a power of two number of LBAs.
+ *
+ * Returns the zone size in number of blocks upon success or an error code upon failure.
*/
-static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
+static s64 sd_zbc_check_zone_size(struct scsi_disk *sdkp)
{
u64 zone_blocks = 0;
sector_t block = 0;
@@ -390,8 +392,6 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
int ret;
u8 same;
- sdkp->zone_blocks = 0;
-
/* Get a buffer */
buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
if (!buf)
@@ -423,16 +423,17 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
/* Parse zone descriptors */
while (rec < buf + buf_len) {
- zone_blocks = get_unaligned_be64(&rec[8]);
- if (sdkp->zone_blocks == 0) {
- sdkp->zone_blocks = zone_blocks;
- } else if (zone_blocks != sdkp->zone_blocks &&
- (block + zone_blocks < sdkp->capacity
- || zone_blocks > sdkp->zone_blocks)) {
- zone_blocks = 0;
+ u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
+
+ if (zone_blocks == 0) {
+ zone_blocks = this_zone_blocks;
+ } else if (this_zone_blocks != zone_blocks &&
+ (block + this_zone_blocks < sdkp->capacity
+ || this_zone_blocks > zone_blocks)) {
+ this_zone_blocks = 0;
goto out;
}
- block += zone_blocks;
+ block += this_zone_blocks;
rec += 64;
}
@@ -445,8 +446,6 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
} while (block < sdkp->capacity);
- zone_blocks = sdkp->zone_blocks;
-
out:
if (!zone_blocks) {
if (sdkp->first_scan)
@@ -466,8 +465,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
"Zone size too large\n");
ret = -ENODEV;
} else {
- sdkp->zone_blocks = zone_blocks;
- sdkp->zone_shift = ilog2(zone_blocks);
+ ret = zone_blocks;
}
out_free:
@@ -478,15 +476,14 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
/**
* sd_zbc_alloc_zone_bitmap - Allocate a zone bitmap (one bit per zone).
- * @sdkp: The disk of the bitmap
+ * @nr_zones: Number of zones to allocate space for.
+ * @numa_node: NUMA node to allocate the memory from.
*/
-static inline unsigned long *sd_zbc_alloc_zone_bitmap(struct scsi_disk *sdkp)
+static inline unsigned long *
+sd_zbc_alloc_zone_bitmap(u32 nr_zones, int numa_node)
{
- struct request_queue *q = sdkp->disk->queue;
-
- return kzalloc_node(BITS_TO_LONGS(sdkp->nr_zones)
- * sizeof(unsigned long),
- GFP_KERNEL, q->node);
+ return kzalloc_node(BITS_TO_LONGS(nr_zones) * sizeof(unsigned long),
+ GFP_KERNEL, numa_node);
}
/**
@@ -494,6 +491,7 @@ static inline unsigned long *sd_zbc_alloc_zone_bitmap(struct scsi_disk *sdkp)
* @sdkp: disk used
* @buf: report reply buffer
* @buflen: length of @buf
+ * @zone_shift: logarithm base 2 of the number of blocks in a zone
* @seq_zones_bitmap: bitmap of sequential zones to set
*
* Parse reported zone descriptors in @buf to identify sequential zones and
@@ -503,7 +501,7 @@ static inline unsigned long *sd_zbc_alloc_zone_bitmap(struct scsi_disk *sdkp)
* Return the LBA after the last zone reported.
*/
static sector_t sd_zbc_get_seq_zones(struct scsi_disk *sdkp, unsigned char *buf,
- unsigned int buflen,
+ unsigned int buflen, u32 zone_shift,
unsigned long *seq_zones_bitmap)
{
sector_t lba, next_lba = sdkp->capacity;
@@ -522,7 +520,7 @@ static sector_t sd_zbc_get_seq_zones(struct scsi_disk *sdkp, unsigned char *buf,
if (type != ZBC_ZONE_TYPE_CONV &&
cond != ZBC_ZONE_COND_READONLY &&
cond != ZBC_ZONE_COND_OFFLINE)
- set_bit(lba >> sdkp->zone_shift, seq_zones_bitmap);
+ set_bit(lba >> zone_shift, seq_zones_bitmap);
next_lba = lba + get_unaligned_be64(&rec[8]);
rec += 64;
}
@@ -531,12 +529,16 @@ static sector_t sd_zbc_get_seq_zones(struct scsi_disk *sdkp, unsigned char *buf,
}
/**
- * sd_zbc_setup_seq_zones_bitmap - Initialize the disk seq zone bitmap.
+ * sd_zbc_setup_seq_zones_bitmap - Initialize a seq zone bitmap.
* @sdkp: target disk
+ * @zone_shift: logarithm base 2 of the number of blocks in a zone
+ * @nr_zones: number of zones to set up a seq zone bitmap for
*
* Allocate a zone bitmap and initialize it by identifying sequential zones.
*/
-static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp)
+static unsigned long *
+sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp, u32 zone_shift,
+ u32 nr_zones)
{
struct request_queue *q = sdkp->disk->queue;
unsigned long *seq_zones_bitmap;
@@ -544,9 +546,9 @@ static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp)
unsigned char *buf;
int ret = -ENOMEM;
- seq_zones_bitmap = sd_zbc_alloc_zone_bitmap(sdkp);
+ seq_zones_bitmap = sd_zbc_alloc_zone_bitmap(nr_zones, q->node);
if (!seq_zones_bitmap)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
if (!buf)
@@ -557,7 +559,7 @@ static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp)
if (ret)
goto out;
lba = sd_zbc_get_seq_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
- seq_zones_bitmap);
+ zone_shift, seq_zones_bitmap);
}
if (lba != sdkp->capacity) {
@@ -569,12 +571,9 @@ static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp)
kfree(buf);
if (ret) {
kfree(seq_zones_bitmap);
- return ret;
+ return ERR_PTR(ret);
}
-
- q->seq_zones_bitmap = seq_zones_bitmap;
-
- return 0;
+ return seq_zones_bitmap;
}
static void sd_zbc_cleanup(struct scsi_disk *sdkp)
@@ -590,44 +589,64 @@ static void sd_zbc_cleanup(struct scsi_disk *sdkp)
q->nr_zones = 0;
}
-static int sd_zbc_setup(struct scsi_disk *sdkp)
+static int sd_zbc_setup(struct scsi_disk *sdkp, u32 zone_blocks)
{
struct request_queue *q = sdkp->disk->queue;
+ u32 zone_shift = ilog2(zone_blocks);
+ u32 nr_zones;
int ret;
- /* READ16/WRITE16 is mandatory for ZBC disks */
- sdkp->device->use_16_for_rw = 1;
- sdkp->device->use_10_for_rw = 0;
-
/* chunk_sectors indicates the zone size */
- blk_queue_chunk_sectors(sdkp->disk->queue,
- logical_to_sectors(sdkp->device, sdkp->zone_blocks));
- sdkp->nr_zones =
- round_up(sdkp->capacity, sdkp->zone_blocks) >> sdkp->zone_shift;
+ blk_queue_chunk_sectors(q,
+ logical_to_sectors(sdkp->device, zone_blocks));
+ nr_zones = round_up(sdkp->capacity, zone_blocks) >> zone_shift;
/*
* Initialize the device request queue information if the number
* of zones changed.
*/
- if (sdkp->nr_zones != q->nr_zones) {
-
- sd_zbc_cleanup(sdkp);
-
- q->nr_zones = sdkp->nr_zones;
- if (sdkp->nr_zones) {
- q->seq_zones_wlock = sd_zbc_alloc_zone_bitmap(sdkp);
- if (!q->seq_zones_wlock) {
+ if (nr_zones != sdkp->nr_zones || nr_zones != q->nr_zones) {
+ unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
+ size_t zone_bitmap_size;
+
+ if (nr_zones) {
+ seq_zones_wlock = sd_zbc_alloc_zone_bitmap(nr_zones,
+ q->node);
+ if (!seq_zones_wlock) {
ret = -ENOMEM;
goto err;
}
- ret = sd_zbc_setup_seq_zones_bitmap(sdkp);
- if (ret) {
- sd_zbc_cleanup(sdkp);
+ seq_zones_bitmap = sd_zbc_setup_seq_zones_bitmap(sdkp,
+ zone_shift, nr_zones);
+ if (IS_ERR(seq_zones_bitmap)) {
+ ret = PTR_ERR(seq_zones_bitmap);
+ kfree(seq_zones_wlock);
goto err;
}
}
-
+ zone_bitmap_size = BITS_TO_LONGS(nr_zones) *
+ sizeof(unsigned long);
+ blk_mq_freeze_queue(q);
+ if (q->nr_zones != nr_zones) {
+ /* READ16/WRITE16 is mandatory for ZBC disks */
+ sdkp->device->use_16_for_rw = 1;
+ sdkp->device->use_10_for_rw = 0;
+
+ sdkp->zone_blocks = zone_blocks;
+ sdkp->zone_shift = zone_shift;
+ sdkp->nr_zones = nr_zones;
+ q->nr_zones = nr_zones;
+ swap(q->seq_zones_wlock, seq_zones_wlock);
+ swap(q->seq_zones_bitmap, seq_zones_bitmap);
+ } else if (memcmp(q->seq_zones_bitmap, seq_zones_bitmap,
+ zone_bitmap_size) != 0) {
+ memcpy(q->seq_zones_bitmap, seq_zones_bitmap,
+ zone_bitmap_size);
+ }
+ blk_mq_unfreeze_queue(q);
+ kfree(seq_zones_wlock);
+ kfree(seq_zones_bitmap);
}
return 0;
@@ -639,6 +658,7 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
{
+ int64_t zone_blocks;
int ret;
if (!sd_is_zoned(sdkp))
@@ -675,12 +695,16 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
* Check zone size: only devices with a constant zone size (except
* an eventual last runt zone) that is a power of 2 are supported.
*/
- ret = sd_zbc_check_zone_size(sdkp);
- if (ret)
+ zone_blocks = sd_zbc_check_zone_size(sdkp);
+ ret = -EFBIG;
+ if (zone_blocks != (u32)zone_blocks)
+ goto err;
+ ret = zone_blocks;
+ if (ret < 0)
goto err;
/* The drive satisfies the kernel restrictions: set it up */
- ret = sd_zbc_setup(sdkp);
+ ret = sd_zbc_setup(sdkp, zone_blocks);
if (ret)
goto err;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9af3e0f430bc..21e21f273a21 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -605,6 +605,11 @@ struct request_queue {
* initialized by the low level device driver (e.g. scsi/sd.c).
* Stacking drivers (device mappers) may or may not initialize
* these fields.
+ *
+ * Reads of this information must be protected with blk_queue_enter() /
+ * blk_queue_exit(). Modifying this information is only allowed while
+ * no requests are being processed. See also blk_mq_freeze_queue() and
+ * blk_mq_unfreeze_queue().
*/
unsigned int nr_zones;
unsigned long *seq_zones_bitmap;
--
2.16.3
A zone reset fails only for conventional zones. Resetting a
conventional zone indicates a bug in the software that submitted the
reset, so information about such a failure is useful. Hence, do not
suppress information about zone reset failures.
Signed-off-by: Bart Van Assche <bart.vanassche(a)wdc.com>
Cc: Damien Le Moal <damien.lemoal(a)wdc.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Hannes Reinecke <hare(a)suse.com>
Cc: stable(a)vger.kernel.org
---
drivers/scsi/sd_zbc.c | 12 ------------
1 file changed, 12 deletions(-)
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 2d0c06f7db3e..f290e8a9923d 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -284,18 +284,6 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
switch (req_op(rq)) {
case REQ_OP_ZONE_RESET:
-
- if (result &&
- sshdr->sense_key == ILLEGAL_REQUEST &&
- sshdr->asc == 0x24)
- /*
- * INVALID FIELD IN CDB error: reset of a conventional
- * zone was attempted. Nothing to worry about, so be
- * quiet about the error.
- */
- rq->rq_flags |= RQF_QUIET;
- break;
-
case REQ_OP_WRITE:
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
--
2.16.3
scsi_io_completion() translates the sense key ILLEGAL REQUEST / ASC
0x21 into ACTION_FAIL. That means that setting cmd->allowed to zero
in sd_zbc_complete() for this sense key / ASC combination is not
necessary. Hence remove the code that resets cmd->allowed from
sd_zbc_complete().
Signed-off-by: Bart Van Assche <bart.vanassche(a)wdc.com>
Cc: Damien Le Moal <damien.lemoal(a)wdc.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Hannes Reinecke <hare(a)suse.com>
Cc: stable(a)vger.kernel.org
---
drivers/scsi/sd_zbc.c | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 41df75eea57b..2d0c06f7db3e 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -299,16 +299,6 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
case REQ_OP_WRITE:
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
-
- if (result &&
- sshdr->sense_key == ILLEGAL_REQUEST &&
- sshdr->asc == 0x21)
- /*
- * INVALID ADDRESS FOR WRITE error: It is unlikely that
- * retrying write requests failed with any kind of
- * alignement error will result in success. So don't.
- */
- cmd->allowed = 0;
break;
case REQ_OP_ZONE_REPORT:
--
2.16.3
From: Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
(cherry picked from commit 36268223c1e9981d6cfc33aff8520b3bde4b8114)
As:
1) It's known that hypervisors lie about the environment anyhow (host
mismatch)
2) Even if the hypervisor (Xen, KVM, VMWare, etc) provided a valid
"correct" value, it all gets to be very murky when migration happens
(do you provide the "new" microcode of the machine?).
And in reality the cloud vendors are the ones that should make sure that
the microcode that is running is correct and we should just sing lalalala
and trust them.
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Wanpeng Li <kernellwp(a)gmail.com>
Cc: kvm <kvm(a)vger.kernel.org>
Cc: Krčmář <rkrcmar(a)redhat.com>
Cc: Borislav Petkov <bp(a)alien8.de>
CC: "H. Peter Anvin" <hpa(a)zytor.com>
CC: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20180226213019.GE9497@char.us.oracle.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
[Yi Sun: port to 4.4]
Signed-off-by: Yi Sun <yi.y.sun(a)linux.intel.com>
---
arch/x86/kernel/cpu/intel.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index af28610..221c030 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -71,6 +71,13 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
{
int i;
+ /*
+ * We know that hypervisors lie to us about the microcode version, so
+ * we may as well hope that they are running the correct version.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return false;
+
for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
if (c->x86_model == spectre_bad_microcodes[i].model &&
c->x86_mask == spectre_bad_microcodes[i].stepping)
--
1.9.1
From: Paolo Bonzini <pbonzini(a)redhat.com>
commit 946fbbc13dce68902f64515b610eeb2a6c3d7a64 upstream.
vmx_vcpu_run() and svm_vcpu_run() are large functions, and giving
branch hints to the compiler can actually make a substantial cycle
difference by keeping the fast path contiguous in memory.
With this optimization, the retpoline-guest/retpoline-host case is
about 50 cycles faster.
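For reference, unlikely() is a thin wrapper around GCC's
branch-prediction builtin (simplified from include/linux/compiler.h);
the compiler lays the annotated path out of line so the hot path stays
contiguous in the instruction cache:

#define unlikely(x)     __builtin_expect(!!(x), 0)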
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
Reviewed-by: Jim Mattson <jmattson(a)google.com>
Cc: David Woodhouse <dwmw(a)amazon.co.uk>
Cc: KarimAllah Ahmed <karahmed(a)amazon.de>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Radim Krčmář <rkrcmar(a)redhat.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: kvm(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Link: http://lkml.kernel.org/r/20180222154318.20361-3-pbonzini@redhat.com
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
[Yi Sun: cherry pick to 4.4]
Signed-off-by: Yi Sun <yi.y.sun(a)linux.intel.com>
---
arch/x86/kvm/svm.c | 2 +-
arch/x86/kvm/vmx.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cb46661..35da1e2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4017,7 +4017,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
+ if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
if (svm->spec_ctrl)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6c109df..1f165cc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8743,7 +8743,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
+ if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
if (vmx->spec_ctrl)
--
1.9.1
When looking up the clock we must use client->dev as the device, since
that is the one probed via DT.
Signed-off-by: Peter Ujfalusi <peter.ujfalusi(a)ti.com>
Cc: stable(a)vger.kernel.org # 4.16+
---
Hi,
Changes since v1:
- Removed the Fixes line and added the tag only for v4.16+ to avoid
possible breakage in pre-v4.16 kernels.
Regards,
Peter
drivers/mfd/twl-core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/mfd/twl-core.c b/drivers/mfd/twl-core.c
index d3133a371e27..c649344fd7f2 100644
--- a/drivers/mfd/twl-core.c
+++ b/drivers/mfd/twl-core.c
@@ -1177,7 +1177,7 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id)
twl_priv->ready = true;
/* setup clock framework */
- clocks_init(&pdev->dev, pdata ? pdata->clock : NULL);
+ clocks_init(&client->dev, pdata ? pdata->clock : NULL);
/* read TWL IDCODE Register */
if (twl_class_is_4030()) {
--
Peter
From: Hector Martin <marcan(a)marcan.st>
[ Upstream commit 188775181bc05f29372b305ef96485840e351fde ]
At least some JMicron controllers issue buggy oversized DMA reads when
fetching context descriptors, always fetching 0x20 bytes at once for
descriptors which are only 0x10 bytes long. This is often harmless, but
can cause page faults on modern systems with IOMMUs:
DMAR: [DMA Read] Request device [05:00.0] fault addr fff56000 [fault reason 06] PTE Read access is not set
firewire_ohci 0000:05:00.0: DMA context IT0 has stopped, error code: evt_descriptor_read
This works around the problem by always leaving 0x10 padding bytes at
the end of descriptor buffer pages, which should be harmless to do
unconditionally, in case other controllers have the same behavior.
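To see the arithmetic, consider a hypothetical 4 KiB page (sketch only;
the helper below is not from the patch): a descriptor that starts in
the last 0x10 bytes of a page makes the controller's 0x20-byte fetch
cross into the next, possibly unmapped, IOMMU page.

static int oversized_fetch_crosses_page(unsigned long desc_offset)
{
        /* Buggy controllers fetch 0x20 bytes even for 0x10-byte descriptors. */
        return ((desc_offset & 0xfff) + 0x20) > 0x1000;
}

/* desc_offset == 0xff0: the fetch ends at 0x1010, one page too far.
 * Reserving 0x10 padding bytes at the end of each buffer page keeps
 * every fetch inside the mapped page. */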
Signed-off-by: Hector Martin <marcan(a)marcan.st>
Reviewed-by: Clemens Ladisch <clemens(a)ladisch.de>
Signed-off-by: Stefan Richter <stefanr(a)s5r6.in-berlin.de>
Signed-off-by: Sasha Levin <alexander.levin(a)microsoft.com>
---
drivers/firewire/ohci.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index ccf52368a073..45c048751f3b 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -1128,7 +1128,13 @@ static int context_add_buffer(struct context *ctx)
return -ENOMEM;
offset = (void *)&desc->buffer - (void *)desc;
- desc->buffer_size = PAGE_SIZE - offset;
+ /*
+ * Some controllers, like JMicron ones, always issue 0x20-byte DMA reads
+ * for descriptors, even 0x10-byte ones. This can cause page faults when
+ * an IOMMU is in use and the oversized read crosses a page boundary.
+ * Work around this by always leaving at least 0x10 bytes of padding.
+ */
+ desc->buffer_size = PAGE_SIZE - offset - 0x10;
desc->buffer_bus = bus_addr + offset;
desc->used = 0;
--
2.15.1
From: Fabio Estevam <fabio.estevam(a)nxp.com>
imx6ul and imx7 report the following error:
caam_jr 2142000.jr1: 40000789: DECO: desc idx 7:
Protocol Size Error - A protocol has seen an error in size. When
running RSA, pdb size N < (size of F) when no formatting is used; or
pdb size N < (F + 11) when formatting is used.
------------[ cut here ]------------
WARNING: CPU: 0 PID: 1 at crypto/asymmetric_keys/public_key.c:148
public_key_verify_signature+0x27c/0x2b0
This error happens because the signature contains 257 bytes, including
a leading zero as the first element.
Fix the problem by stripping off the leading zero from input data
before feeding it to the CAAM accelerator.
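A self-contained demo of the stripping helper the patch moves earlier
in the file (the loop condition is reordered here so the length is
checked before the pointer is dereferenced; everything else matches the
patch):

#include <stdio.h>
#include <stddef.h>

typedef unsigned char u8;

static void caam_rsa_drop_leading_zeros(const u8 **ptr, size_t *nbytes)
{
        while (*nbytes && !**ptr) {
                (*ptr)++;
                (*nbytes)--;
        }
}

int main(void)
{
        u8 sig[] = { 0x00, 0x12, 0x34 };        /* a 257-byte signature, shrunk */
        const u8 *p = sig;
        size_t len = sizeof(sig);

        caam_rsa_drop_leading_zeros(&p, &len);
        printf("len=%zu first=0x%02x\n", len, p[0]);    /* len=2 first=0x12 */
        return 0;
}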
Fixes: 8c419778ab57e497b5 ("crypto: caam - add support for RSA algorithm")
Cc: <stable(a)vger.kernel.org>
Reported-by: Martin Townsend <mtownsend1973(a)gmail.com>
Signed-off-by: Fabio Estevam <fabio.estevam(a)nxp.com>
---
Changes since v2:
- Check if the length is zero after calling caam_rsa_drop_leading_zeros()
drivers/crypto/caam/caampkc.c | 45 +++++++++++++++++++++++++++++++++++--------
1 file changed, 37 insertions(+), 8 deletions(-)
diff --git a/drivers/crypto/caam/caampkc.c b/drivers/crypto/caam/caampkc.c
index 7a897209..47467ff 100644
--- a/drivers/crypto/caam/caampkc.c
+++ b/drivers/crypto/caam/caampkc.c
@@ -166,6 +166,14 @@ static void rsa_priv_f3_done(struct device *dev, u32 *desc, u32 err,
akcipher_request_complete(req, err);
}
+static void caam_rsa_drop_leading_zeros(const u8 **ptr, size_t *nbytes)
+{
+ while (!**ptr && *nbytes) {
+ (*ptr)++;
+ (*nbytes)--;
+ }
+}
+
static struct rsa_edesc *rsa_edesc_alloc(struct akcipher_request *req,
size_t desclen)
{
@@ -178,7 +186,36 @@ static struct rsa_edesc *rsa_edesc_alloc(struct akcipher_request *req,
int sgc;
int sec4_sg_index, sec4_sg_len = 0, sec4_sg_bytes;
int src_nents, dst_nents;
+ const u8 *temp;
+ void *buffer;
+ size_t len;
+
+ buffer = kzalloc(req->src_len, GFP_ATOMIC);
+ if (!buffer)
+ return ERR_PTR(-ENOMEM);
+
+ sg_copy_to_buffer(req->src, sg_nents(req->src),
+ buffer, req->src_len);
+ temp = (u8 *)buffer;
+ len = req->src_len;
+ /*
+ * Check if the buffer contains leading zeros and if
+ * it does, drop the leading zeros
+ */
+ if (temp[0] == 0) {
+ caam_rsa_drop_leading_zeros(&temp, &len);
+ if (!len) {
+ kfree(buffer);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ req->src_len = len;
+ sg_copy_from_buffer(req->src, sg_nents(req->src),
+ (void *)temp, req->src_len);
+ }
+
+ kfree(buffer);
src_nents = sg_nents_for_len(req->src, req->src_len);
dst_nents = sg_nents_for_len(req->dst, req->dst_len);
@@ -683,14 +720,6 @@ static void caam_rsa_free_key(struct caam_rsa_key *key)
memset(key, 0, sizeof(*key));
}
-static void caam_rsa_drop_leading_zeros(const u8 **ptr, size_t *nbytes)
-{
- while (!**ptr && *nbytes) {
- (*ptr)++;
- (*nbytes)--;
- }
-}
-
/**
* caam_read_rsa_crt - Used for reading dP, dQ, qInv CRT members.
* dP, dQ and qInv could decode to less than corresponding p, q length, as the
--
2.7.4
The patch below does not apply to the 4.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 339b2ae0cd5d4a58f9efe06e4ee36adbeca59228 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe(a)redhat.com>
Date: Wed, 14 Feb 2018 13:46:53 +0800
Subject: [PATCH] x86/apic: Fix restoring boot IRQ mode in reboot and
kexec/kdump
This is a regression fix.
Before, to fix erratum AVR31, the following commit:
522e66464467 ("x86/apic: Disable I/O APIC before shutdown of the local APIC")
... moved the lapic_shutdown() call to after disable_IO_APIC() in the reboot
and kexec/kdump code paths.
This introduced the following regression: disable_IO_APIC() not only clears
the IO-APIC, but it also restores boot IRQ mode by setting the
LAPIC/APIC/IMCR, calling lapic_shutdown() after disable_IO_APIC() will
disable LAPIC and ruin the possible virtual wire mode setting which
the code has been trying to do all along.
The consequence is that a KVM guest kernel always prints the warning below
during kexec/kdump as the kernel boots up:
[ 0.001000] WARNING: CPU: 0 PID: 0 at arch/x86/kernel/apic/apic.c:1467 setup_local_APIC+0x228/0x330
[ ........]
[ 0.001000] Call Trace:
[ 0.001000] apic_bsp_setup+0x56/0x74
[ 0.001000] x86_late_time_init+0x11/0x16
[ 0.001000] start_kernel+0x3c9/0x486
[ 0.001000] secondary_startup_64+0xa5/0xb0
[ ........]
[ 0.001000] masked ExtINT on CPU#0
To fix this, just call clear_IO_APIC() to stop the IO-APIC where
disable_IO_APIC() was called, and call restore_boot_irq_mode() to
restore boot IRQ mode before a reboot or a kexec/kdump jump.
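Condensed, the intended shutdown ordering is (sketch only, mirroring
the two hunks below; the crash path additionally calls
ioapic_zap_locks() first):

static void example_shutdown_sequence(void)
{
        clear_IO_APIC();         /* quiet the IO-APIC, keep the boot IRQ mode */
        lapic_shutdown();        /* then disable the local APIC */
        restore_boot_irq_mode(); /* restore virtual wire/PIC mode for the next kernel */
}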
Signed-off-by: Baoquan He <bhe(a)redhat.com>
Reviewed-by: Eric W. Biederman <ebiederm(a)xmission.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: douly.fnst(a)cn.fujitsu.com
Cc: joro(a)8bytes.org
Cc: prarit(a)redhat.com
Cc: stable(a)vger.kernel.org
Cc: uobergfe(a)redhat.com
Fixes: commit 522e66464467 ("x86/apic: Disable I/O APIC before shutdown of the local APIC")
Link: http://lkml.kernel.org/r/20180214054656.3780-4-bhe@redhat.com
[ Rewrote the changelog. ]
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 10e74d4778a1..1f6680427ff0 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -199,9 +199,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_X86_IO_APIC
/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
ioapic_zap_locks();
- disable_IO_APIC();
+ clear_IO_APIC();
#endif
lapic_shutdown();
+ restore_boot_irq_mode();
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2126b9d27c34..725624b6c0c0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -666,7 +666,7 @@ void native_machine_shutdown(void)
* Even without the erratum, it still makes sense to quiet IO APIC
* before disabling Local APIC.
*/
- disable_IO_APIC();
+ clear_IO_APIC();
#endif
#ifdef CONFIG_SMP
@@ -680,6 +680,7 @@ void native_machine_shutdown(void)
#endif
lapic_shutdown();
+ restore_boot_irq_mode();
#ifdef CONFIG_HPET_TIMER
hpet_disable();
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f572a034d9e07157dd07d2b6be3a1459b5574b58 Mon Sep 17 00:00:00 2001
From: Greentime Hu <greentime(a)andestech.com>
Date: Thu, 16 Nov 2017 19:33:35 +0800
Subject: [PATCH] earlycon: add reg-offset to physical address before mapping
The wrong virtual address would be used because the reg-offset has not
yet been added to port->mapbase. We have to update port->mapbase before
earlycon_map() is called.
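Condensed, the fix simply moves the mapping after the offset fixup
(sketch of the hunk below; reg_offset stands for the parsed
"reg-offset" property value and is not a variable in the patch):

port->mapbase = addr + reg_offset;                  /* final physical base */
port->membase = earlycon_map(port->mapbase, SZ_4K); /* map it afterwards */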
Signed-off-by: Greentime Hu <greentime(a)andestech.com>
Acked-by: Arnd Bergmann <arnd(a)arndb.de>
Acked-by: Rob Herring <robh(a)kernel.org>
Cc: Peter Hurley <peter(a)hurleysoftware.com>
Cc: stable(a)vger.kernel.org
Fixes: 088da2a17619 ("of: earlycon: Initialize port fields from DT
properties")
diff --git a/drivers/tty/serial/earlycon.c b/drivers/tty/serial/earlycon.c
index 870e84fb6e39..a24278380fec 100644
--- a/drivers/tty/serial/earlycon.c
+++ b/drivers/tty/serial/earlycon.c
@@ -245,11 +245,12 @@ int __init of_setup_earlycon(const struct earlycon_id *match,
}
port->mapbase = addr;
port->uartclk = BASE_BAUD * 16;
- port->membase = earlycon_map(port->mapbase, SZ_4K);
val = of_get_flat_dt_prop(node, "reg-offset", NULL);
if (val)
port->mapbase += be32_to_cpu(*val);
+ port->membase = earlycon_map(port->mapbase, SZ_4K);
+
val = of_get_flat_dt_prop(node, "reg-shift", NULL);
if (val)
port->regshift = be32_to_cpu(*val);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f572a034d9e07157dd07d2b6be3a1459b5574b58 Mon Sep 17 00:00:00 2001
From: Greentime Hu <greentime(a)andestech.com>
Date: Thu, 16 Nov 2017 19:33:35 +0800
Subject: [PATCH] earlycon: add reg-offset to physical address before mapping
The wrong virtual address would be used because the reg-offset has not
yet been added to port->mapbase. We have to update port->mapbase before
earlycon_map() is called.
Signed-off-by: Greentime Hu <greentime(a)andestech.com>
Acked-by: Arnd Bergmann <arnd(a)arndb.de>
Acked-by: Rob Herring <robh(a)kernel.org>
Cc: Peter Hurley <peter(a)hurleysoftware.com>
Cc: stable(a)vger.kernel.org
Fixes: 088da2a17619 ("of: earlycon: Initialize port fields from DT
properties")
diff --git a/drivers/tty/serial/earlycon.c b/drivers/tty/serial/earlycon.c
index 870e84fb6e39..a24278380fec 100644
--- a/drivers/tty/serial/earlycon.c
+++ b/drivers/tty/serial/earlycon.c
@@ -245,11 +245,12 @@ int __init of_setup_earlycon(const struct earlycon_id *match,
}
port->mapbase = addr;
port->uartclk = BASE_BAUD * 16;
- port->membase = earlycon_map(port->mapbase, SZ_4K);
val = of_get_flat_dt_prop(node, "reg-offset", NULL);
if (val)
port->mapbase += be32_to_cpu(*val);
+ port->membase = earlycon_map(port->mapbase, SZ_4K);
+
val = of_get_flat_dt_prop(node, "reg-shift", NULL);
if (val)
port->regshift = be32_to_cpu(*val);
The patch below does not apply to the 4.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f572a034d9e07157dd07d2b6be3a1459b5574b58 Mon Sep 17 00:00:00 2001
From: Greentime Hu <greentime(a)andestech.com>
Date: Thu, 16 Nov 2017 19:33:35 +0800
Subject: [PATCH] earlycon: add reg-offset to physical address before mapping
The wrong virtual address would be used because the reg-offset has not
yet been added to port->mapbase. We have to update port->mapbase before
earlycon_map() is called.
Signed-off-by: Greentime Hu <greentime(a)andestech.com>
Acked-by: Arnd Bergmann <arnd(a)arndb.de>
Acked-by: Rob Herring <robh(a)kernel.org>
Cc: Peter Hurley <peter(a)hurleysoftware.com>
Cc: stable(a)vger.kernel.org
Fixes: 088da2a17619 ("of: earlycon: Initialize port fields from DT
properties")
diff --git a/drivers/tty/serial/earlycon.c b/drivers/tty/serial/earlycon.c
index 870e84fb6e39..a24278380fec 100644
--- a/drivers/tty/serial/earlycon.c
+++ b/drivers/tty/serial/earlycon.c
@@ -245,11 +245,12 @@ int __init of_setup_earlycon(const struct earlycon_id *match,
}
port->mapbase = addr;
port->uartclk = BASE_BAUD * 16;
- port->membase = earlycon_map(port->mapbase, SZ_4K);
val = of_get_flat_dt_prop(node, "reg-offset", NULL);
if (val)
port->mapbase += be32_to_cpu(*val);
+ port->membase = earlycon_map(port->mapbase, SZ_4K);
+
val = of_get_flat_dt_prop(node, "reg-shift", NULL);
if (val)
port->regshift = be32_to_cpu(*val);
The patch below does not apply to the 4.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f572a034d9e07157dd07d2b6be3a1459b5574b58 Mon Sep 17 00:00:00 2001
From: Greentime Hu <greentime(a)andestech.com>
Date: Thu, 16 Nov 2017 19:33:35 +0800
Subject: [PATCH] earlycon: add reg-offset to physical address before mapping
The wrong virtual address would be used because the reg-offset has not
yet been added to port->mapbase. We have to update port->mapbase before
earlycon_map() is called.
Signed-off-by: Greentime Hu <greentime(a)andestech.com>
Acked-by: Arnd Bergmann <arnd(a)arndb.de>
Acked-by: Rob Herring <robh(a)kernel.org>
Cc: Peter Hurley <peter(a)hurleysoftware.com>
Cc: stable(a)vger.kernel.org
Fixes: 088da2a17619 ("of: earlycon: Initialize port fields from DT
properties")
diff --git a/drivers/tty/serial/earlycon.c b/drivers/tty/serial/earlycon.c
index 870e84fb6e39..a24278380fec 100644
--- a/drivers/tty/serial/earlycon.c
+++ b/drivers/tty/serial/earlycon.c
@@ -245,11 +245,12 @@ int __init of_setup_earlycon(const struct earlycon_id *match,
}
port->mapbase = addr;
port->uartclk = BASE_BAUD * 16;
- port->membase = earlycon_map(port->mapbase, SZ_4K);
val = of_get_flat_dt_prop(node, "reg-offset", NULL);
if (val)
port->mapbase += be32_to_cpu(*val);
+ port->membase = earlycon_map(port->mapbase, SZ_4K);
+
val = of_get_flat_dt_prop(node, "reg-shift", NULL);
if (val)
port->regshift = be32_to_cpu(*val);
Hi,
Building 4.9.94 in the same way we have been building previous 4.9 releases yields the following error:
DEBUG: tests/code-reading.c: In function 'read_object_code':
DEBUG: tests/code-reading.c:228:19: error: 'KMOD_DECOMP_LEN' undeclared (first use in this function)
DEBUG: char decomp_name[KMOD_DECOMP_LEN];
DEBUG: ^
DEBUG: tests/code-reading.c:228:19: note: each undeclared identifier is reported only once for each
function it appears in
DEBUG: tests/code-reading.c:291:3: warning: implicit declaration of function
'dso__decompress_kmodule_path' [-Wimplicit-function-declaration]
DEBUG: if (dso__decompress_kmodule_path(al.map->dso, objdump_name,
DEBUG: ^
DEBUG: tests/code-reading.c:291:3: warning: nested extern declaration of
'dso__decompress_kmodule_path' [-Wnested-externs]
DEBUG: tests/code-reading.c:228:7: warning: unused variable 'decomp_name' [-Wunused-variable]
DEBUG: char decomp_name[KMOD_DECOMP_LEN];
DEBUG: ^
DEBUG: CC tests/topology.o
DEBUG: CC tests/cpumap.o
DEBUG: CC tests/stat.o
DEBUG: CC tests/event_update.o
DEBUG: mv: cannot stat 'tests/.code-reading.o.tmp': No such file or directory
DEBUG: make[3]: *** [tests/code-reading.o] Error 1
DEBUG: make[3]: *** Waiting for unfinished jobs....
DEBUG: make[2]: *** [util] Error 2
DEBUG: make[1]: *** [libperf-in.o] Error 2
DEBUG: make[1]: *** Waiting for unfinished jobs....
DEBUG: LD bench/perf-in.o
DEBUG: make[2]: *** [tests] Error 2
DEBUG: make[1]: *** [perf-in.o] Error 2
As far as I can see, KMOD_DECOMP_LEN was introduced by 7525a238be8f ("perf tests: Decompress kernel
module before objdump"), but I have zero deep knowledge in this area so I may be very wrong here.
Cheers,
Pavlos Parissis
A normal request completion can occur before or while
BLK_EH_RESET_TIMER is being handled, and this race may cause the
request to never be completed, since the driver's .timeout() may keep
returning BLK_EH_RESET_TIMER.
This issue can't be fixed completely by the driver, since a normal
completion can still occur between .timeout() returning and
BLK_EH_RESET_TIMER being handled.
This patch fixes the race by introducing the request state
MQ_RQ_COMPLETE_IN_TIMEOUT and by reading and writing the request's
state while holding the queue lock. The lock could actually be
per-request, but introducing one lock for such an unusual event is not
necessary.
Also handle timed-out requests in two steps, as sketched below:
1) in the first step, call .timeout(), and reset the timer for
BLK_EH_RESET_TIMER;
2) in the second step, synchronize with the normal completion path by
holding the queue lock, avoiding the race between BLK_EH_RESET_TIMER
and normal completion.
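A schematic sketch of step 2 (the types and names below are
illustrative stand-ins, not the actual blk-mq code): both the normal
completion path and the timeout path take the same lock before deciding
who owns the request, so a completion racing with BLK_EH_RESET_TIMER is
recorded instead of lost.

enum rq_state { RQ_IN_FLIGHT, RQ_COMPLETE, RQ_COMPLETE_IN_TIMEOUT };

struct rq {
        spinlock_t *lock;       /* stands in for q->queue_lock */
        enum rq_state state;
        bool aborted;           /* stands in for aborted_gstate */
};

/* Normal completion path: complete now, or hand off to the timeout. */
static bool normal_completion_wins(struct rq *rq)
{
        unsigned long flags;
        bool complete_now = false;

        spin_lock_irqsave(rq->lock, flags);
        if (!rq->aborted)
                complete_now = true;    /* the timeout never claimed it */
        else
                rq->state = RQ_COMPLETE_IN_TIMEOUT; /* timeout must finish it */
        spin_unlock_irqrestore(rq->lock, flags);

        return complete_now;
}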
Cc: "jianchao.wang" <jianchao.w.wang(a)oracle.com>
Cc: Bart Van Assche <bart.vanassche(a)wdc.com>
Cc: Tejun Heo <tj(a)kernel.org>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Ming Lei <ming.lei(a)redhat.com>
Cc: Sagi Grimberg <sagi(a)grimberg.me>
Cc: Israel Rukshin <israelr(a)mellanox.com>,
Cc: Max Gurtovoy <maxg(a)mellanox.com>
Cc: stable(a)vger.kernel.org
Cc: Martin Steigerwald <Martin(a)Lichtvoll.de>
Signed-off-by: Ming Lei <ming.lei(a)redhat.com>
---
block/blk-mq.c | 116 ++++++++++++++++++++++++++++++++++++++++---------
block/blk-mq.h | 1 +
include/linux/blkdev.h | 6 +++
3 files changed, 102 insertions(+), 21 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d6a21898933d..9415e65302a8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -630,10 +630,27 @@ void blk_mq_complete_request(struct request *rq)
* However, that would complicate paths which want to synchronize
* against us. Let stay in sync with the issue path so that
* hctx_lock() covers both issue and completion paths.
+ *
+ * Cover the complete vs BLK_EH_RESET_TIMER race in the slow path
+ * by holding the queue lock.
*/
hctx_lock(hctx, &srcu_idx);
if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
__blk_mq_complete_request(rq);
+ else {
+ unsigned long flags;
+ bool need_complete = false;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ if (!blk_mq_rq_aborted_gstate(rq))
+ need_complete = true;
+ else
+ blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE_IN_TIMEOUT);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ if (need_complete)
+ __blk_mq_complete_request(rq);
+ }
hctx_unlock(hctx, srcu_idx);
}
EXPORT_SYMBOL(blk_mq_complete_request);
@@ -810,7 +827,7 @@ struct blk_mq_timeout_data {
unsigned int nr_expired;
};
-static void blk_mq_rq_timed_out(struct request *req, bool reserved)
+static void blk_mq_rq_pre_timed_out(struct request *req, bool reserved)
{
const struct blk_mq_ops *ops = req->q->mq_ops;
enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
@@ -818,18 +835,44 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
if (ops->timeout)
ret = ops->timeout(req, reserved);
+ if (ret == BLK_EH_RESET_TIMER)
+ blk_add_timer(req);
+
+ req->timeout_ret = ret;
+}
+
+static void blk_mq_rq_timed_out(struct request *req, bool reserved)
+{
+ enum blk_eh_timer_return ret = req->timeout_ret;
+ unsigned long flags;
+
switch (ret) {
case BLK_EH_HANDLED:
+ spin_lock_irqsave(req->q->queue_lock, flags);
+ complete_rq:
+ if (blk_mq_rq_state(req) == MQ_RQ_COMPLETE_IN_TIMEOUT)
+ blk_mq_rq_update_state(req, MQ_RQ_IN_FLIGHT);
+ spin_unlock_irqrestore(req->q->queue_lock, flags);
__blk_mq_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
/*
- * As nothing prevents from completion happening while
- * ->aborted_gstate is set, this may lead to ignored
- * completions and further spurious timeouts.
+ * The normal completion may happen while the timeout is being
+ * handled, or even after .timeout() has returned, so once the
+ * request has been completed we can't reset the timer any more,
+ * since this request may be handled as BLK_EH_RESET_TIMER in the
+ * next timeout handling too, and in that situation it has to be
+ * completed instead.
+ *
+ * Hold the queue lock to cover reads/writes of the rq's
+ * aborted_gstate and normal state, so the race can be
+ * avoided completely.
*/
+ spin_lock_irqsave(req->q->queue_lock, flags);
blk_mq_rq_update_aborted_gstate(req, 0);
- blk_add_timer(req);
+ if (blk_mq_rq_state(req) == MQ_RQ_COMPLETE_IN_TIMEOUT)
+ goto complete_rq;
+ spin_unlock_irqrestore(req->q->queue_lock, flags);
break;
case BLK_EH_NOT_HANDLED:
req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
@@ -875,7 +918,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
}
}
-static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_prepare_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
/*
@@ -887,9 +930,40 @@ static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
*/
if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
READ_ONCE(rq->gstate) == rq->aborted_gstate)
+ blk_mq_rq_pre_timed_out(rq, reserved);
+}
+
+static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+ struct request *rq, void *priv, bool reserved)
+{
+ if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
+ READ_ONCE(rq->gstate) == rq->aborted_gstate)
blk_mq_rq_timed_out(rq, reserved);
}
+static void blk_mq_timeout_synchronize_rcu(struct request_queue *q,
+ bool reset_expired)
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+ bool has_rcu = false;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->nr_expired)
+ continue;
+
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+ has_rcu = true;
+ else
+ synchronize_srcu(hctx->srcu);
+
+ if (reset_expired)
+ hctx->nr_expired = 0;
+ }
+ if (has_rcu)
+ synchronize_rcu();
+}
+
static void blk_mq_timeout_work(struct work_struct *work)
{
struct request_queue *q =
@@ -899,8 +973,6 @@ static void blk_mq_timeout_work(struct work_struct *work)
.next_set = 0,
.nr_expired = 0,
};
- struct blk_mq_hw_ctx *hctx;
- int i;
/* A deadlock might occur if a request is stuck requiring a
* timeout at the same time a queue freeze is waiting
@@ -922,27 +994,26 @@ static void blk_mq_timeout_work(struct work_struct *work)
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
if (data.nr_expired) {
- bool has_rcu = false;
-
/*
* Wait till everyone sees ->aborted_gstate. The
* sequential waits for SRCUs aren't ideal. If this ever
* becomes a problem, we can add per-hw_ctx rcu_head and
* wait in parallel.
*/
- queue_for_each_hw_ctx(q, hctx, i) {
- if (!hctx->nr_expired)
- continue;
+ blk_mq_timeout_synchronize_rcu(q, false);
- if (!(hctx->flags & BLK_MQ_F_BLOCKING))
- has_rcu = true;
- else
- synchronize_srcu(hctx->srcu);
+ /* call .timeout() for timed-out requests */
+ blk_mq_queue_tag_busy_iter(q, blk_mq_prepare_expired, NULL);
- hctx->nr_expired = 0;
- }
- if (has_rcu)
- synchronize_rcu();
+ /*
+ * If .timeout returns BLK_EH_HANDLED, wait till the current
+ * completion is done, to avoid updating the state of an
+ * already completed request.
+ *
+ * If .timeout returns BLK_EH_RESET_TIMER, wait till
+ * blk_add_timer() is committed before completing this rq.
+ */
+ blk_mq_timeout_synchronize_rcu(q, true);
/* terminate the ones we won */
blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
@@ -952,6 +1023,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
data.next = blk_rq_timeout(round_jiffies_up(data.next));
mod_timer(&q->timeout, data.next);
} else {
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
/*
* Request timeouts are handled as a forward rolling timer. If
* we end up here it means that no requests are pending and
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 88c558f71819..0426d048743d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -35,6 +35,7 @@ enum mq_rq_state {
MQ_RQ_IDLE = 0,
MQ_RQ_IN_FLIGHT = 1,
MQ_RQ_COMPLETE = 2,
+ MQ_RQ_COMPLETE_IN_TIMEOUT = 3,
MQ_RQ_STATE_BITS = 2,
MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9af3e0f430bc..8278f67d39a6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -252,8 +252,14 @@ struct request {
struct list_head timeout_list;
union {
+ /* used after completion */
struct __call_single_data csd;
+
+ /* used in io scheduler, before dispatch */
u64 fifo_time;
+
+ /* used after dispatch and before completion */
+ int timeout_ret;
};
/*
--
2.9.5