Presently when an error is encountered during probe of the cxlflash
adapter, a deadlock is seen with cpu thread stuck inside
cxlflash_remove(). Below is the trace of the deadlock as logged by
khungtaskd:
cxlflash 0006:00:00.0: cxlflash_probe: init_afu failed rc=-16
INFO: task kworker/80:1:890 blocked for more than 120 seconds.
Not tainted 5.0.0-rc4-capi2-kexec+ #2
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kworker/80:1 D 0 890 2 0x00000808
Workqueue: events work_for_cpu_fn
Call Trace:
0x4d72136320 (unreliable)
__switch_to+0x2cc/0x460
__schedule+0x2bc/0xac0
schedule+0x40/0xb0
cxlflash_remove+0xec/0x640 [cxlflash]
cxlflash_probe+0x370/0x8f0 [cxlflash]
local_pci_probe+0x6c/0x140
work_for_cpu_fn+0x38/0x60
process_one_work+0x260/0x530
worker_thread+0x280/0x5d0
kthread+0x1a8/0x1b0
ret_from_kernel_thread+0x5c/0x80
INFO: task systemd-udevd:5160 blocked for more than 120 seconds.
The deadlock occurs as cxlflash_remove() is called from
cxlflash_probe() without setting 'cxlflash_cfg->state' to STATE_PROBED
and the probe thread starts to wait on
'cxlflash_cfg->reset_waitq'. Since the device was never successfully
probed the 'cxlflash_cfg->state' never changes from STATE_PROBING
hence the deadlock occurs.
We fix this deadlock by setting the variable 'cxlflash_cfg->state' to
STATE_PROBED in case an error occurs during cxlflash_probe() and just
before calling cxlflash_remove().
Cc: stable(a)vger.kernel.org
Fixes: c21e0bbfc485("cxlflash: Base support for IBM CXL Flash Adapter")
Signed-off-by: Vaibhav Jain <vaibhav(a)linux.ibm.com>
---
drivers/scsi/cxlflash/main.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index bfa13e3b191c..c8bad2c093b8 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -3687,6 +3687,7 @@ static int cxlflash_probe(struct pci_dev *pdev,
host->max_cmd_len = CXLFLASH_MAX_CDB_LEN;
cfg = shost_priv(host);
+ cfg->state = STATE_PROBING;
cfg->host = host;
rc = alloc_mem(cfg);
if (rc) {
@@ -3775,6 +3776,7 @@ static int cxlflash_probe(struct pci_dev *pdev,
return rc;
out_remove:
+ cfg->state = STATE_PROBED;
cxlflash_remove(pdev);
goto out;
}
--
2.20.1
Since .scsi_done() must only be called after scsi_queue_rq() has
finished, make sure that the SRP initiator driver does not call
.scsi_done() while scsi_queue_rq() is in progress. Although
invoking sg_reset -d while I/O is in progress works fine with kernel
v4.20 and before, that is not the case with kernel v5.0-rc1. This
patch avoids that the following crash is triggered with kernel
v5.0-rc1:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000138
CPU: 0 PID: 360 Comm: kworker/0:1H Tainted: G B 5.0.0-rc1-dbg+ #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
Workqueue: kblockd blk_mq_run_work_fn
RIP: 0010:blk_mq_dispatch_rq_list+0x116/0xb10
Call Trace:
blk_mq_sched_dispatch_requests+0x2f7/0x300
__blk_mq_run_hw_queue+0xd6/0x180
blk_mq_run_work_fn+0x27/0x30
process_one_work+0x4f1/0xa20
worker_thread+0x67/0x5b0
kthread+0x1cf/0x1f0
ret_from_fork+0x24/0x30
Cc: Sergey Gorenko <sergeygo(a)mellanox.com>
Cc: Max Gurtovoy <maxg(a)mellanox.com>
Cc: Laurence Oberman <loberman(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Fixes: 94a9174c630c ("IB/srp: reduce lock coverage of command completion") # v2.6.38
Signed-off-by: Bart Van Assche <bvanassche(a)acm.org>
---
Changes compared to v1: left out the code that waits until in-progress requests
have finished.
drivers/infiniband/ulp/srp/ib_srp.c | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index af5197b5e7f1..085dba075651 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -3037,7 +3037,6 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
{
struct srp_target_port *target = host_to_target(scmnd->device->host);
struct srp_rdma_ch *ch;
- int i, j;
u8 status;
shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n");
@@ -3049,15 +3048,6 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
if (status)
return FAILED;
- for (i = 0; i < target->ch_count; i++) {
- ch = &target->ch[i];
- for (j = 0; j < target->req_ring_size; ++j) {
- struct srp_request *req = &ch->req_ring[j];
-
- srp_finish_req(ch, req, scmnd->device, DID_RESET << 16);
- }
- }
-
return SUCCESS;
}
--
2.20.1.495.gaa96b0ce6b-goog
There is one CVE: CVE-2018-5391 kernel: IP fragments with random offsets allow a
remote denial of service (FragmentSmack),
A fix is a merge commit in the Linux kernel tree:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
consisting of the following commits:
7969e5c40dfd04799d4341f1b7cd266b6e47f227 ip: discard IPv4 datagrams with overlapping segments.
385114dec8a49b5e5945e77ba7de6356106713f4 net: modify skb_rbtree_purge to return the truesize of all purged skbs.
fa0f527358bd900ef92f925878ed6bfbd51305cc ip: use rb trees for IP frag queue.
All above patches are with rb tree to fix this CVE, which is very similar the CVE-2018-5390, that I have backport
to stable 4.4 branch in last year.
In these patchset, I will backport some patches to fix CVE-2018-5391 with rb tree.
v1->v2: in this patch, ipv6: defrag: drop non-last frags smaller than min mtu
fix the incorrect return value of nf_ct_frag6_gather.
Dan Carpenter (1):
ipv4: frags: precedence bug in ip_expire()
Eric Dumazet (2):
net: speed up skb_rbtree_purge()
inet: frags: get rif of inet_frag_evicting()
Florian Westphal (1):
ipv6: defrag: drop non-last frags smaller than min mtu
Michal Kubecek (1):
net: ipv4: do not handle duplicate fragments as overlapping
Peter Oskolkov (5):
ip: discard IPv4 datagrams with overlapping segments.
net: modify skb_rbtree_purge to return the truesize of all purged
skbs.
ip: use rb trees for IP frag queue.
ip: add helpers to process in-order fragments faster.
ip: process in-order fragments efficiently
Taehee Yoo (1):
ip: frags: fix crash in ip_do_fragment()
include/linux/skbuff.h | 4 +-
include/net/inet_frag.h | 12 +-
include/uapi/linux/snmp.h | 1 +
net/core/skbuff.c | 17 +-
net/ipv4/inet_fragment.c | 16 +-
net/ipv4/ip_fragment.c | 410 +++++++++++++++++++-------------
net/ipv4/proc.c | 1 +
net/ipv6/netfilter/nf_conntrack_reasm.c | 6 +
net/ipv6/reassembly.c | 9 +-
9 files changed, 292 insertions(+), 184 deletions(-)
--
1.8.3.1
From: Benjamin Herrenschmidt <benh(a)kernel.crashing.org>
commit 726e41097920a73e4c7c33385dcc0debb1281e18 upstream
For devices with a class, we create a "glue" directory between
the parent device and the new device with the class name.
This directory is never "explicitely" removed when empty however,
this is left to the implicit sysfs removal done by kobject_release()
when the object loses its last reference via kobject_put().
This is problematic because as long as it's not been removed from
sysfs, it is still present in the class kset and in sysfs directory
structure.
The presence in the class kset exposes a use after free bug fixed
by the previous patch, but the presence in sysfs means that until
the kobject is released, which can take a while (especially with
kobject debugging), any attempt at re-creating such as binding a
new device for that class/parent pair, will result in a sysfs
duplicate file name error.
This fixes it by instead doing an explicit kobject_del() when
the glue dir is empty, by keeping track of the number of
child devices of the gluedir.
This is made easy by the fact that all glue dir operations are
done with a global mutex, and there's already a function
(cleanup_glue_dir) called in all the right places taking that
mutex that can be enhanced for this. It appears that this was
in fact the intent of the function, but the implementation was
wrong.
Backport Note: kref_read() is not present in 4.4. Hence,
use atomic_read(&kref.refcount) instead of kref_read(&kref).
Signed-off-by: Benjamin Herrenschmidt <benh(a)kernel.crashing.org>
Acked-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Zubin Mithra <zsm(a)chromium.org>
---
drivers/base/core.c | 2 ++
include/linux/kobject.h | 17 +++++++++++++++++
2 files changed, 19 insertions(+)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 049ccc070ce56..cb5718d2669ee 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -862,6 +862,8 @@ static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
return;
mutex_lock(&gdp_mutex);
+ if (!kobject_has_children(glue_dir))
+ kobject_del(glue_dir);
kobject_put(glue_dir);
mutex_unlock(&gdp_mutex);
}
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index e6284591599ec..5957c6a3fd7f9 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -113,6 +113,23 @@ extern void kobject_put(struct kobject *kobj);
extern const void *kobject_namespace(struct kobject *kobj);
extern char *kobject_get_path(struct kobject *kobj, gfp_t flag);
+/**
+ * kobject_has_children - Returns whether a kobject has children.
+ * @kobj: the object to test
+ *
+ * This will return whether a kobject has other kobjects as children.
+ *
+ * It does NOT account for the presence of attribute files, only sub
+ * directories. It also assumes there is no concurrent addition or
+ * removal of such children, and thus relies on external locking.
+ */
+static inline bool kobject_has_children(struct kobject *kobj)
+{
+ WARN_ON_ONCE(atomic_read(&kobj->kref.refcount) == 0);
+
+ return kobj->sd && kobj->sd->dir.subdirs;
+}
+
struct kobj_type {
void (*release)(struct kobject *kobj);
const struct sysfs_ops *sysfs_ops;
--
2.20.1.495.gaa96b0ce6b-goog
The 'pss_locked' field of smaps_rollup was being calculated incorrectly
as it accumulated the current pss everytime a locked VMA was found.
Fix that by making sure we record the current pss value before each VMA
is walked. So, we can only add the delta if the VMA was found to be
VM_LOCKED.
Fixes: 493b0e9d945f ("mm: add /proc/pid/smaps_rollup")
Cc: stable(a)vger.kernel.org # 4.14.y 4.19.y
Signed-off-by: Sandeep Patil <sspatil(a)android.com>
---
fs/proc/task_mmu.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f0ec9edab2f3..51a00a2b4733 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -709,6 +709,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
#endif
.mm = vma->vm_mm,
};
+ unsigned long pss;
smaps_walk.private = mss;
@@ -737,11 +738,12 @@ static void smap_gather_stats(struct vm_area_struct *vma,
}
}
#endif
-
+ /* record current pss so we can calculate the delta after page walk */
+ pss = mss->pss;
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
if (vma->vm_flags & VM_LOCKED)
- mss->pss_locked += mss->pss;
+ mss->pss_locked += mss->pss - pss;
}
#define SEQ_PUT_DEC(str, val) \
--
2.20.1.321.g9e740568ce-goog
From: "Gustavo A. R. Silva" <gustavo(a)embeddedor.com>
[ Upstream commit a37805098900a6e73a55b3a43b7d3bcd987bb3f4 ]
idx can be indirectly controlled by user-space, hence leading to a
potential exploitation of the Spectre variant 1 vulnerability.
This issue was detected with the help of Smatch:
drivers/gpu/drm/drm_bufs.c:1420 drm_legacy_freebufs() warn: potential
spectre issue 'dma->buflist' [r] (local cap)
Fix this by sanitizing idx before using it to index dma->buflist
Notice that given that speculation windows are large, the policy is
to kill the speculation on the first load and not worry if it can be
completed with a dependent load/store [1].
[1] https://marc.info/?l=linux-kernel&m=152449131114778&w=2
Signed-off-by: Gustavo A. R. Silva <gustavo(a)embeddedor.com>
Signed-off-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20181016095549.GA23586@embedd…
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/drm_bufs.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/drm_bufs.c b/drivers/gpu/drm/drm_bufs.c
index ba8cfe65c65b..e2f775d1c112 100644
--- a/drivers/gpu/drm/drm_bufs.c
+++ b/drivers/gpu/drm/drm_bufs.c
@@ -36,6 +36,8 @@
#include <drm/drmP.h>
#include "drm_legacy.h"
+#include <linux/nospec.h>
+
static struct drm_map_list *drm_find_matching_map(struct drm_device *dev,
struct drm_local_map *map)
{
@@ -1417,6 +1419,7 @@ int drm_legacy_freebufs(struct drm_device *dev, void *data,
idx, dma->buf_count - 1);
return -EINVAL;
}
+ idx = array_index_nospec(idx, dma->buf_count);
buf = dma->buflist[idx];
if (buf->file_priv != file_priv) {
DRM_ERROR("Process %d freeing buffer not owned\n",
--
2.19.1
This is an updated version of the patch previously posted on January 24.
Changes since then:
- The previous version didn't take filesystems with a single bitmap
block per filesystem into account, and could still loop endlessly
in that case.
- When starting a scan at the beginning of a bitmap block and we wrap
around, there is no need to rescan that bitmap block again. This
revised version takes that into account as well.
- At the end of gfs2_rbm_find, we update rd_extfail_pt when the scan
has started at the beginning of the resource group. However, we
should update rd_extfail_pt whenever we have scanned the entire
resource group, including when we have wrapped around.
--
The fix from commit 2d29f6b96d8f introduced an unexpected performance
regression when allocating blocks. Fix by rewriting the overly complicated
wrap-around logic in gfs2_rbm_find. Discovered and verified with iozone.
Fixes: 2d29f6b96d8f ("gfs2: Fix loop in gfs2_rbm_find")
Cc: stable(a)vger.kernel.org # v3.13+
Signed-off-by: Andreas Gruenbacher <agruenba(a)redhat.com>
---
fs/gfs2/rgrp.c | 54 +++++++++++++++++++++++---------------------------
1 file changed, 25 insertions(+), 29 deletions(-)
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 831d7cb5a49c4..52a4f340a8672 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1729,25 +1729,22 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
const struct gfs2_inode *ip, bool nowrap)
{
+ bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
struct buffer_head *bh;
- int initial_bii;
- u32 initial_offset;
- int first_bii = rbm->bii;
- u32 first_offset = rbm->offset;
+ int last_bii;
u32 offset;
u8 *buffer;
- int n = 0;
- int iters = rbm->rgd->rd_length;
+ bool wrapped = false;
int ret;
struct gfs2_bitmap *bi;
struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
- /* If we are not starting at the beginning of a bitmap, then we
- * need to add one to the bitmap count to ensure that we search
- * the starting bitmap twice.
+ /*
+ * Determine the last bitmap to search. If we're not starting at the
+ * beginning of a bitmap, we need to search that bitmap twice to scan
+ * the entire resource group.
*/
- if (rbm->offset != 0)
- iters++;
+ last_bii = rbm->bii - (rbm->offset == 0);
while(1) {
bi = rbm_bi(rbm);
@@ -1761,47 +1758,46 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
WARN_ON(!buffer_uptodate(bh));
if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
buffer = bi->bi_clone + bi->bi_offset;
- initial_offset = rbm->offset;
offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
- if (offset == BFITNOENT)
- goto bitmap_full;
+ if (offset == BFITNOENT) {
+ if (state == GFS2_BLKST_FREE && rbm->offset == 0)
+ set_bit(GBF_FULL, &bi->bi_flags);
+ goto next_bitmap;
+ }
rbm->offset = offset;
if (ip == NULL)
return 0;
- initial_bii = rbm->bii;
ret = gfs2_reservation_check_and_update(rbm, ip,
minext ? *minext : 0,
&maxext);
if (ret == 0)
return 0;
- if (ret > 0) {
- n += (rbm->bii - initial_bii);
+ if (ret > 0)
goto next_iter;
- }
if (ret == -E2BIG) {
- n += rbm->bii - initial_bii;
rbm->bii = 0;
rbm->offset = 0;
goto res_covered_end_of_rgrp;
}
return ret;
-bitmap_full: /* Mark bitmap as full and fall through */
- if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
- set_bit(GBF_FULL, &bi->bi_flags);
-
next_bitmap: /* Find next bitmap in the rgrp */
rbm->offset = 0;
rbm->bii++;
if (rbm->bii == rbm->rgd->rd_length)
rbm->bii = 0;
res_covered_end_of_rgrp:
- if ((rbm->bii == 0) && nowrap)
- break;
- n++;
+ if (rbm->bii == 0) {
+ if (wrapped)
+ break;
+ wrapped = true;
+ if (nowrap)
+ break;
+ }
next_iter:
- if (n >= iters)
+ /* Have we scanned the entire resource group? */
+ if (wrapped && rbm->bii > last_bii)
break;
}
@@ -1811,8 +1807,8 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
/* If the extent was too small, and it's smaller than the smallest
to have failed before, remember for future reference that it's
useless to search this rgrp again for this amount or more. */
- if ((first_offset == 0) && (first_bii == 0) &&
- (*minext < rbm->rgd->rd_extfail_pt))
+ if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
+ *minext < rbm->rgd->rd_extfail_pt)
rbm->rgd->rd_extfail_pt = *minext;
/* If the maximum extent we found is big enough to fulfill the
--
2.20.1