Hi Amir,
Here's a backport of this patch to 4.18 and earlier. Tested good with
ltp/fanotify09.
I didn't quite understand the relevance of masking against ALL_FSNOTIFY_EVENTS
in __fsnotify_parent() and the backport in fsnotify() is also not quite trivial.
So, can you please review?
Thanks,
Miklos
---
From: Amir Goldstein <amir73il(a)gmail.com>
Date: Tue, 30 Oct 2018 20:29:53 +0200
Subject: [PATCH] fanotify: fix handling of events on child sub-directory
When an event is reported on a sub-directory and the parent inode has
a mark mask with FS_EVENT_ON_CHILD|FS_ISDIR, the event will be sent to
fsnotify() even if the event type is not in the parent mark mask
(e.g. FS_OPEN).
Further more, if that event happened on a mount or a filesystem with
a mount/sb mark that does have that event type in their mask, the "on
child" event will be reported on the mount/sb mark. That is not
desired, because user will get a duplicate event for the same action.
Note that the event reported on the victim inode is never merged with
the event reported on the parent inode, because of the check in
should_merge(): old_fsn->inode == new_fsn->inode.
Fix this by looking for a match of an actual event type (i.e. not just
FS_ISDIR) in parent's inode mark mask and by not reporting an "on child"
event to group if event type is only found on mount/sb marks.
[backport hint: The bug seems to have always been in fanotify, but this
patch will only apply cleanly to v4.19.y]
Cc: <stable(a)vger.kernel.org> # v4.19
Signed-off-by: Amir Goldstein <amir73il(a)gmail.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
(cherry picked from commit b469e7e47c8a075cc08bcd1e85d4365134bdcdd5)
---
fs/notify/fanotify/fanotify.c | 10 +++++-----
fs/notify/fsnotify.c | 8 ++++++--
2 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index f90842efea13..78126bd7c162 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -113,12 +113,12 @@ static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info,
continue;
mark = iter_info->marks[type];
/*
- * if the event is for a child and this inode doesn't care about
- * events on the child, don't send it!
+ * If the event is for a child and this mark doesn't care about
+ * events on a child, don't send it!
*/
- if (type == FSNOTIFY_OBJ_TYPE_INODE &&
- (event_mask & FS_EVENT_ON_CHILD) &&
- !(mark->mask & FS_EVENT_ON_CHILD))
+ if (event_mask & FS_EVENT_ON_CHILD &&
+ (type != FSNOTIFY_OBJ_TYPE_INODE ||
+ !(mark->mask & FS_EVENT_ON_CHILD)))
continue;
marks_mask |= mark->mask;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ababdbfab537..46d27b357226 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -158,9 +158,9 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask
parent = dget_parent(dentry);
p_inode = parent->d_inode;
- if (unlikely(!fsnotify_inode_watches_children(p_inode)))
+ if (unlikely(!fsnotify_inode_watches_children(p_inode))) {
__fsnotify_update_child_dentry_flags(p_inode);
- else if (p_inode->i_fsnotify_mask & mask) {
+ } else if (p_inode->i_fsnotify_mask & mask & ~FS_EVENT_ON_CHILD) {
struct name_snapshot name;
/* we are notifying a parent so come up with the new mask which
@@ -329,6 +329,10 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
else
mnt = NULL;
+ /* An event "on child" is not intended for a mount mark */
+ if (mask & FS_EVENT_ON_CHILD)
+ mnt = NULL;
+
/*
* Optimization: srcu_read_lock() has a memory barrier which can
* be expensive. It protects walking the *_fsnotify_marks lists.
--
2.14.5
commit 2f31a67f01a8beb22cae754c53522cb61a005750 upstream.
Backport to apply on 4.14.y and older stable releases
USB3 roothub might autosuspend before a plugged USB3 device is detected,
causing USB3 device enumeration failure.
USB3 devices don't show up as connected and enabled until USB3 link trainig
completes. On a fast booting platform with a slow USB3 link training the
link might reach the connected enabled state just as the bus is suspending.
If this device is discovered first time by the xhci_bus_suspend() routine
it will be put to U3 suspended state like the other ports which failed to
suspend earlier.
The hub thread will notice the connect change and resume the bus,
moving the port back to U0
This U0 -> U3 -> U0 transition right after being connected seems to be
too much for some devices, causing them to first go to SS.Inactive state,
and finally end up stuck in a polling state with reset asserted
Fix this by failing the bus suspend if a port has a connect change or is
in a polling state in xhci_bus_suspend().
Don't do any port changes until all ports are checked, buffer all port
changes and only write them in the end if suspend can proceed
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/host/xhci-hub.c | 60 ++++++++++++++++++++++++++++++++++-----------
1 file changed, 46 insertions(+), 14 deletions(-)
diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
index d2a9767..61db9ee 100644
--- a/drivers/usb/host/xhci-hub.c
+++ b/drivers/usb/host/xhci-hub.c
@@ -1481,13 +1481,16 @@ int xhci_bus_suspend(struct usb_hcd *hcd)
__le32 __iomem **port_array;
struct xhci_bus_state *bus_state;
unsigned long flags;
+ u32 portsc_buf[USB_MAXCHILDREN];
+ bool wake_enabled;
max_ports = xhci_get_ports(hcd, &port_array);
bus_state = &xhci->bus_state[hcd_index(hcd)];
+ wake_enabled = hcd->self.root_hub->do_remote_wakeup;
spin_lock_irqsave(&xhci->lock, flags);
- if (hcd->self.root_hub->do_remote_wakeup) {
+ if (wake_enabled) {
if (bus_state->resuming_ports || /* USB2 */
bus_state->port_remote_wakeup) { /* USB3 */
spin_unlock_irqrestore(&xhci->lock, flags);
@@ -1495,26 +1498,36 @@ int xhci_bus_suspend(struct usb_hcd *hcd)
return -EBUSY;
}
}
-
- port_index = max_ports;
+ /*
+ * Prepare ports for suspend, but don't write anything before all ports
+ * are checked and we know bus suspend can proceed
+ */
bus_state->bus_suspended = 0;
+ port_index = max_ports;
while (port_index--) {
- /* suspend the port if the port is not suspended */
u32 t1, t2;
- int slot_id;
t1 = readl(port_array[port_index]);
t2 = xhci_port_state_to_neutral(t1);
+ portsc_buf[port_index] = 0;
- if ((t1 & PORT_PE) && !(t1 & PORT_PLS_MASK)) {
- xhci_dbg(xhci, "port %d not suspended\n", port_index);
- slot_id = xhci_find_slot_id_by_port(hcd, xhci,
- port_index + 1);
- if (slot_id) {
+ /* Bail out if a USB3 port has a new device in link training */
+ if ((t1 & PORT_PLS_MASK) == XDEV_POLLING) {
+ bus_state->bus_suspended = 0;
+ spin_unlock_irqrestore(&xhci->lock, flags);
+ xhci_dbg(xhci, "Bus suspend bailout, port in polling\n");
+ return -EBUSY;
+ }
+
+ /* suspend ports in U0, or bail out for new connect changes */
+ if ((t1 & PORT_PE) && (t1 & PORT_PLS_MASK) == XDEV_U0) {
+ if ((t1 & PORT_CSC) && wake_enabled) {
+ bus_state->bus_suspended = 0;
spin_unlock_irqrestore(&xhci->lock, flags);
- xhci_stop_device(xhci, slot_id, 1);
- spin_lock_irqsave(&xhci->lock, flags);
+ xhci_dbg(xhci, "Bus suspend bailout, port connect change\n");
+ return -EBUSY;
}
+ xhci_dbg(xhci, "port %d not suspended\n", port_index);
t2 &= ~PORT_PLS_MASK;
t2 |= PORT_LINK_STROBE | XDEV_U3;
set_bit(port_index, &bus_state->bus_suspended);
@@ -1523,7 +1536,7 @@ int xhci_bus_suspend(struct usb_hcd *hcd)
* including the USB 3.0 roothub, but only if CONFIG_PM
* is enabled, so also enable remote wake here.
*/
- if (hcd->self.root_hub->do_remote_wakeup) {
+ if (wake_enabled) {
if (t1 & PORT_CONNECT) {
t2 |= PORT_WKOC_E | PORT_WKDISC_E;
t2 &= ~PORT_WKCONN_E;
@@ -1543,7 +1556,26 @@ int xhci_bus_suspend(struct usb_hcd *hcd)
t1 = xhci_port_state_to_neutral(t1);
if (t1 != t2)
- writel(t2, port_array[port_index]);
+ portsc_buf[port_index] = t2;
+ }
+
+ /* write port settings, stopping and suspending ports if needed */
+ port_index = max_ports;
+ while (port_index--) {
+ if (!portsc_buf[port_index])
+ continue;
+ if (test_bit(port_index, &bus_state->bus_suspended)) {
+ int slot_id;
+
+ slot_id = xhci_find_slot_id_by_port(hcd, xhci,
+ port_index + 1);
+ if (slot_id) {
+ spin_unlock_irqrestore(&xhci->lock, flags);
+ xhci_stop_device(xhci, slot_id, 1);
+ spin_lock_irqsave(&xhci->lock, flags);
+ }
+ }
+ writel(portsc_buf[port_index], port_array[port_index]);
}
hcd->state = HC_STATE_SUSPENDED;
bus_state->next_statechange = jiffies + msecs_to_jiffies(10);
--
2.7.4
From: Parav Pandit <parav(a)mellanox.com>
commit b2bedfb39541a7e14798d066b6f8685d84c8fcf5 upstream
Currently qp->port stores the port number whenever IB_QP_PORT
QP attribute mask is set (during QP state transition to INIT state).
This port number should be stored for the real QP when XRC target QP
is used.
Follow the ib_modify_qp() implementation and hide the access to ->real_qp.
This commit is required for proper operation of commit 65be9cbe1224
("RDMA/uverbs: Expand primary and alt AV port checks") which was added
to 4.14.61.
Without this commit, XRC qp's do not work.
Fixes: a512c2fbef9c ("IB/core: Introduce modify QP operation with udata")
Signed-off-by: Jack Morgenstein <jackm(a)dev.mellanox.co.il>
Signed-off-by: Parav Pandit <parav(a)mellanox.com>
Reviewed-by: Daniel Jurgens <danielj(a)mellanox.com>
Signed-off-by: Leon Romanovsky <leon(a)kernel.org>
Signed-off-by: Jason Gunthorpe <jgg(a)mellanox.com>
---
drivers/infiniband/core/verbs.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index feb80db..6d59af0 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1285,7 +1285,7 @@ int ib_resolve_eth_dmac(struct ib_device *device,
/**
* ib_modify_qp_with_udata - Modifies the attributes for the specified QP.
- * @qp: The QP to modify.
+ * @ib_qp: The QP to modify.
* @attr: On input, specifies the QP attributes to modify. On output,
* the current values of selected QP attributes are returned.
* @attr_mask: A bit-mask used to specify which attributes of the QP
@@ -1294,9 +1294,10 @@ int ib_resolve_eth_dmac(struct ib_device *device,
* are being modified.
* It returns 0 on success and returns appropriate error code on error.
*/
-int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr,
+int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr,
int attr_mask, struct ib_udata *udata)
{
+ struct ib_qp *qp = ib_qp->real_qp;
int ret;
if (attr_mask & IB_QP_AV) {
--
1.8.1.2
From: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
vb2_start_streaming() already rolls back the buffers, so there is no
need to call __vb2_queue_cancel(). Especially since __vb2_queue_cancel()
does too much, such as zeroing the q->queued_count value, causing vb2
to think that no buffers have been queued.
It appears that this call to __vb2_queue_cancel() is a left-over from
before commit b3379c6201bb3.
Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
Fixes: b3379c6201bb3 ('vb2: only call start_streaming if sufficient buffers are queued')
Cc: <stable(a)vger.kernel.org> # for v4.16 and up
---
drivers/media/common/videobuf2/videobuf2-core.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index 0ca81d495bda..77e2bfe5e722 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -1941,10 +1941,8 @@ int vb2_core_streamon(struct vb2_queue *q, unsigned int type)
if (ret)
return ret;
ret = vb2_start_streaming(q);
- if (ret) {
- __vb2_queue_cancel(q);
+ if (ret)
return ret;
- }
}
q->streaming = 1;
--
2.19.1
If we race with inode destroy, it's possible for page->mapping to be
NULL before we even enter this routine, as well as after having slept
waiting for the dax entry to become unlocked.
Fixes: c2a7d2a11552 ("filesystem-dax: Introduce dax_lock_mapping_entry()")
Cc: stable(a)vger.kernel.org
Reported-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Matthew Wilcox <willy(a)infradead.org>
---
fs/dax.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/dax.c b/fs/dax.c
index 9bcce89ea18e..e69fc231833b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -365,7 +365,7 @@ bool dax_lock_mapping_entry(struct page *page)
struct address_space *mapping = READ_ONCE(page->mapping);
locked = false;
- if (!dax_mapping(mapping))
+ if (!mapping || !dax_mapping(mapping))
break;
/*
--
2.19.1
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 21556350ade3cb5d7afecc8b3544e56431d21695 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala(a)linux.intel.com>
Date: Wed, 14 Nov 2018 19:34:40 +0200
Subject: [PATCH] drm/i915: Disable LP3 watermarks on all SNB machines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
I have a Thinkpad X220 Tablet in my hands that is losing vblank
interrupts whenever LP3 watermarks are used.
If I nudge the latency value written to the WM3 register just
by one in either direction the problem disappears. That to me
suggests that the punit will not enter the corrsponding
powersave mode (MPLL shutdown IIRC) unless the latency value
in the register matches exactly what we read from SSKPD. Ie.
it's not really a latency value but rather just a cookie
by which the punit can identify the desired power saving state.
On HSW/BDW this was changed such that we actually just write
the WM level number into those bits, which makes much more
sense given the observed behaviour.
We could try to handle this by disallowing LP3 watermarks
only when vblank interrupts are enabled but we'd first have
to prove that only vblank interrupts are affected, which
seems unlikely. Also we can't grab the wm mutex from the
vblank enable/disable hooks because those are called with
various spinlocks held. Thus we'd have to redesigne the
watermark locking. So to play it safe and keep the code
simple we simply disable LP3 watermarks on all SNB machines.
To do that we simply zero out the latency values for
watermark level 3, and we adjust the watermark computation
to check for that. The behaviour now matches that of the
g4x/vlv/skl wm code in the presence of a zeroed latency
value.
v2: s/USHRT_MAX/U32_MAX/ for consistency with the types (Chris)
Cc: stable(a)vger.kernel.org
Cc: Chris Wilson <chris(a)chris-wilson.co.uk>
Acked-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101269
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103713
Signed-off-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181114173440.6730-1-ville.s…
(cherry picked from commit 03981c6ebec4fc7056b9b45f847393aeac90d060)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen(a)linux.intel.com>
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 245f0022bcfd..3fe358db1276 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -2493,6 +2493,9 @@ static uint32_t ilk_compute_pri_wm(const struct intel_crtc_state *cstate,
uint32_t method1, method2;
int cpp;
+ if (mem_value == 0)
+ return U32_MAX;
+
if (!intel_wm_plane_visible(cstate, pstate))
return 0;
@@ -2522,6 +2525,9 @@ static uint32_t ilk_compute_spr_wm(const struct intel_crtc_state *cstate,
uint32_t method1, method2;
int cpp;
+ if (mem_value == 0)
+ return U32_MAX;
+
if (!intel_wm_plane_visible(cstate, pstate))
return 0;
@@ -2545,6 +2551,9 @@ static uint32_t ilk_compute_cur_wm(const struct intel_crtc_state *cstate,
{
int cpp;
+ if (mem_value == 0)
+ return U32_MAX;
+
if (!intel_wm_plane_visible(cstate, pstate))
return 0;
@@ -3008,6 +3017,34 @@ static void snb_wm_latency_quirk(struct drm_i915_private *dev_priv)
intel_print_wm_latency(dev_priv, "Cursor", dev_priv->wm.cur_latency);
}
+static void snb_wm_lp3_irq_quirk(struct drm_i915_private *dev_priv)
+{
+ /*
+ * On some SNB machines (Thinkpad X220 Tablet at least)
+ * LP3 usage can cause vblank interrupts to be lost.
+ * The DEIIR bit will go high but it looks like the CPU
+ * never gets interrupted.
+ *
+ * It's not clear whether other interrupt source could
+ * be affected or if this is somehow limited to vblank
+ * interrupts only. To play it safe we disable LP3
+ * watermarks entirely.
+ */
+ if (dev_priv->wm.pri_latency[3] == 0 &&
+ dev_priv->wm.spr_latency[3] == 0 &&
+ dev_priv->wm.cur_latency[3] == 0)
+ return;
+
+ dev_priv->wm.pri_latency[3] = 0;
+ dev_priv->wm.spr_latency[3] = 0;
+ dev_priv->wm.cur_latency[3] = 0;
+
+ DRM_DEBUG_KMS("LP3 watermarks disabled due to potential for lost interrupts\n");
+ intel_print_wm_latency(dev_priv, "Primary", dev_priv->wm.pri_latency);
+ intel_print_wm_latency(dev_priv, "Sprite", dev_priv->wm.spr_latency);
+ intel_print_wm_latency(dev_priv, "Cursor", dev_priv->wm.cur_latency);
+}
+
static void ilk_setup_wm_latency(struct drm_i915_private *dev_priv)
{
intel_read_wm_latency(dev_priv, dev_priv->wm.pri_latency);
@@ -3024,8 +3061,10 @@ static void ilk_setup_wm_latency(struct drm_i915_private *dev_priv)
intel_print_wm_latency(dev_priv, "Sprite", dev_priv->wm.spr_latency);
intel_print_wm_latency(dev_priv, "Cursor", dev_priv->wm.cur_latency);
- if (IS_GEN6(dev_priv))
+ if (IS_GEN6(dev_priv)) {
snb_wm_latency_quirk(dev_priv);
+ snb_wm_lp3_irq_quirk(dev_priv);
+ }
}
static void skl_setup_wm_latency(struct drm_i915_private *dev_priv)
From: Michael J. Ruhl <michael.j.ruhl(a)intel.com>
commit a0e0cb82804a6a21d9067022c2dfdf80d11da429 upstream
pq_update() can only be called in two places: from the completion
function when the complete (npkts) sequence of packets has been
submitted and processed, or from setup function if a subset of the
packets were submitted (i.e. the error path).
Currently both paths can call pq_update() if an error occurrs. This
race will cause the n_req value to go negative, hanging file_close(),
or cause a crash by freeing the txlist more than once.
Several variables are used to determine SDMA send state. Most of
these are unnecessary, and have code inspectible races between the
setup function and the completion function, in both the send path and
the error path.
The request 'status' value can be set by the setup or by the
completion function. This is code inspectibly racy. Since the status
is not needed in the completion code or by the caller it has been
removed.
The request 'done' value races between usage by the setup and the
completion function. The completion function does not need this.
When the number of processed packets matches npkts, it is done.
The 'has_error' value races between usage of the setup and the
completion function. This can cause incorrect error handling and leave
the n_req in an incorrect value (i.e. negative).
Simplify the code by removing all of the unneeded state checks and
variables.
Clean up iovs node when it is freed.
Eliminate race conditions in the error path:
If all packets are submitted, the completion handler will set the
completion status correctly (ok or aborted).
If all packets are not submitted, the caller must wait until the
submitted packets have completed, and then set the completion status.
These two change eliminate the race condition in the error path.
Cc: <stable(a)vger.kernel.org> # 4.14.0
Reviewed-by: Mitko Haralanov <mitko.haralanov(a)intel.com>
Reviewed-by: Mike Marciniszyn <mike.marciniszyn(a)intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl(a)intel.com>
---
drivers/infiniband/hw/hfi1/user_sdma.c | 87 ++++++++++++++------------------
drivers/infiniband/hw/hfi1/user_sdma.h | 3 -
2 files changed, 39 insertions(+), 51 deletions(-)
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index 8c954a0..c14ec04 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -328,7 +328,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
u8 opcode, sc, vl;
u16 pkey;
u32 slid;
- int req_queued = 0;
u16 dlid;
u32 selector;
@@ -392,7 +391,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
req->data_len = 0;
req->pq = pq;
req->cq = cq;
- req->status = -1;
req->ahg_idx = -1;
req->iov_idx = 0;
req->sent = 0;
@@ -400,12 +398,14 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
req->seqcomp = 0;
req->seqsubmitted = 0;
req->tids = NULL;
- req->done = 0;
req->has_error = 0;
INIT_LIST_HEAD(&req->txps);
memcpy(&req->info, &info, sizeof(info));
+ /* The request is initialized, count it */
+ atomic_inc(&pq->n_reqs);
+
if (req_opcode(info.ctrl) == EXPECTED) {
/* expected must have a TID info and at least one data vector */
if (req->data_iovs < 2) {
@@ -500,7 +500,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
ret = pin_vector_pages(req, &req->iovs[i]);
if (ret) {
req->data_iovs = i;
- req->status = ret;
goto free_req;
}
req->data_len += req->iovs[i].iov.iov_len;
@@ -561,14 +560,10 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
req->ahg_idx = sdma_ahg_alloc(req->sde);
set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
- atomic_inc(&pq->n_reqs);
- req_queued = 1;
/* Send the first N packets in the request to buy us some time */
ret = user_sdma_send_pkts(req, pcount);
- if (unlikely(ret < 0 && ret != -EBUSY)) {
- req->status = ret;
+ if (unlikely(ret < 0 && ret != -EBUSY))
goto free_req;
- }
/*
* It is possible that the SDMA engine would have processed all the
@@ -588,14 +583,8 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
while (req->seqsubmitted != req->info.npkts) {
ret = user_sdma_send_pkts(req, pcount);
if (ret < 0) {
- if (ret != -EBUSY) {
- req->status = ret;
- WRITE_ONCE(req->has_error, 1);
- if (ACCESS_ONCE(req->seqcomp) ==
- req->seqsubmitted - 1)
- goto free_req;
- return ret;
- }
+ if (ret != -EBUSY)
+ goto free_req;
wait_event_interruptible_timeout(
pq->busy.wait_dma,
(pq->state == SDMA_PKT_Q_ACTIVE),
@@ -606,10 +595,19 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
*count += idx;
return 0;
free_req:
- user_sdma_free_request(req, true);
- if (req_queued)
+ /*
+ * If the submitted seqsubmitted == npkts, the completion routine
+ * controls the final state. If sequbmitted < npkts, wait for any
+ * outstanding packets to finish before cleaning up.
+ */
+ if (req->seqsubmitted < req->info.npkts) {
+ if (req->seqsubmitted)
+ wait_event(pq->busy.wait_dma,
+ (req->seqcomp == req->seqsubmitted - 1));
+ user_sdma_free_request(req, true);
pq_update(pq);
- set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
+ set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
+ }
return ret;
}
@@ -917,7 +915,6 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
req->seqsubmitted += count;
if (req->seqsubmitted == req->info.npkts) {
- WRITE_ONCE(req->done, 1);
/*
* The txreq has already been submitted to the HW queue
* so we can free the AHG entry now. Corruption will not
@@ -1347,11 +1344,15 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
return diff;
}
-/*
- * SDMA tx request completion callback. Called when the SDMA progress
- * state machine gets notification that the SDMA descriptors for this
- * tx request have been processed by the DMA engine. Called in
- * interrupt context.
+/**
+ * user_sdma_txreq_cb() - SDMA tx request completion callback.
+ * @txreq: valid sdma tx request
+ * @status: success/failure of request
+ *
+ * Called when the SDMA progress state machine gets notification that
+ * the SDMA descriptors for this tx request have been processed by the
+ * DMA engine. Called in interrupt context.
+ * Only do work on completed sequences.
*/
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
@@ -1360,7 +1361,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
struct user_sdma_request *req;
struct hfi1_user_sdma_pkt_q *pq;
struct hfi1_user_sdma_comp_q *cq;
- u16 idx;
+ enum hfi1_sdma_comp_state state = COMPLETE;
if (!tx->req)
return;
@@ -1373,31 +1374,19 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
SDMA_DBG(req, "SDMA completion with error %d",
status);
WRITE_ONCE(req->has_error, 1);
+ state = ERROR;
}
req->seqcomp = tx->seqnum;
kmem_cache_free(pq->txreq_cache, tx);
- tx = NULL;
-
- idx = req->info.comp_idx;
- if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
- if (req->seqcomp == req->info.npkts - 1) {
- req->status = 0;
- user_sdma_free_request(req, false);
- pq_update(pq);
- set_comp_state(pq, cq, idx, COMPLETE, 0);
- }
- } else {
- if (status != SDMA_TXREQ_S_OK)
- req->status = status;
- if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
- (READ_ONCE(req->done) ||
- READ_ONCE(req->has_error))) {
- user_sdma_free_request(req, false);
- pq_update(pq);
- set_comp_state(pq, cq, idx, ERROR, req->status);
- }
- }
+
+ /* sequence isn't complete? We are done */
+ if (req->seqcomp != req->info.npkts - 1)
+ return;
+
+ user_sdma_free_request(req, false);
+ set_comp_state(pq, cq, req->info.comp_idx, state, status);
+ pq_update(pq);
}
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
@@ -1430,6 +1419,8 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
if (!node)
continue;
+ req->iovs[i].node = NULL;
+
if (unpin)
hfi1_mmu_rb_remove(req->pq->handler,
&node->rb);
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
index 9b8bb56..5af5233 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.h
+++ b/drivers/infiniband/hw/hfi1/user_sdma.h
@@ -196,8 +196,6 @@ struct user_sdma_request {
/* Writeable fields shared with interrupt */
u64 seqcomp ____cacheline_aligned_in_smp;
u64 seqsubmitted;
- /* status of the last txreq completed */
- int status;
/* Send side fields */
struct list_head txps ____cacheline_aligned_in_smp;
@@ -219,7 +217,6 @@ struct user_sdma_request {
u16 tididx;
/* progress index moving along the iovs array */
u8 iov_idx;
- u8 done;
u8 has_error;
struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];