The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 04c26faa51d1e2fe71cf13c45791f5174c37f986 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin(a)gmail.com>
Date: Mon, 17 May 2021 02:28:58 +0800
Subject: [PATCH] tipc: wait and exit until all work queues are done
On some host, a crash could be triggered simply by repeating these
commands several times:
# modprobe tipc
# tipc bearer enable media udp name UDP1 localip 127.0.0.1
# rmmod tipc
[] BUG: unable to handle kernel paging request at ffffffffc096bb00
[] Workqueue: events 0xffffffffc096bb00
[] Call Trace:
[] ? process_one_work+0x1a7/0x360
[] ? worker_thread+0x30/0x390
[] ? create_worker+0x1a0/0x1a0
[] ? kthread+0x116/0x130
[] ? kthread_flush_work_fn+0x10/0x10
[] ? ret_from_fork+0x35/0x40
When removing the TIPC module, the UDP tunnel sock will be delayed to
release in a work queue as sock_release() can't be done in rtnl_lock().
If the work queue is schedule to run after the TIPC module is removed,
kernel will crash as the work queue function cleanup_beareri() code no
longer exists when trying to invoke it.
To fix it, this patch introduce a member wq_count in tipc_net to track
the numbers of work queues in schedule, and wait and exit until all
work queues are done in tipc_exit_net().
Fixes: d0f91938bede ("tipc: add ip/udp media type")
Reported-by: Shuang Li <shuali(a)redhat.com>
Signed-off-by: Xin Long <lucien.xin(a)gmail.com>
Acked-by: Jon Maloy <jmaloy(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 5cc1f0307215..72f3ac73779b 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -119,6 +119,8 @@ static void __net_exit tipc_exit_net(struct net *net)
#ifdef CONFIG_TIPC_CRYPTO
tipc_crypto_stop(&tipc_net(net)->crypto_tx);
#endif
+ while (atomic_read(&tn->wq_count))
+ cond_resched();
}
static void __net_exit tipc_pernet_pre_exit(struct net *net)
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 03de7b213f55..5741ae488bb5 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -149,6 +149,8 @@ struct tipc_net {
#endif
/* Work item for net finalize */
struct tipc_net_work final_work;
+ /* The numbers of work queues in schedule */
+ atomic_t wq_count;
};
static inline struct tipc_net *tipc_net(struct net *net)
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index e556d2cdc064..c2bb818704c8 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -814,6 +814,7 @@ static void cleanup_bearer(struct work_struct *work)
kfree_rcu(rcast, rcu);
}
+ atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count);
dst_cache_destroy(&ub->rcast.dst_cache);
udp_tunnel_sock_release(ub->ubsock);
synchronize_net();
@@ -834,6 +835,7 @@ static void tipc_udp_disable(struct tipc_bearer *b)
RCU_INIT_POINTER(ub->bearer, NULL);
/* sock_release need to be done outside of rtnl lock */
+ atomic_inc(&tipc_net(sock_net(ub->ubsock->sk))->wq_count);
INIT_WORK(&ub->work, cleanup_bearer);
schedule_work(&ub->work);
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 04c26faa51d1e2fe71cf13c45791f5174c37f986 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin(a)gmail.com>
Date: Mon, 17 May 2021 02:28:58 +0800
Subject: [PATCH] tipc: wait and exit until all work queues are done
On some host, a crash could be triggered simply by repeating these
commands several times:
# modprobe tipc
# tipc bearer enable media udp name UDP1 localip 127.0.0.1
# rmmod tipc
[] BUG: unable to handle kernel paging request at ffffffffc096bb00
[] Workqueue: events 0xffffffffc096bb00
[] Call Trace:
[] ? process_one_work+0x1a7/0x360
[] ? worker_thread+0x30/0x390
[] ? create_worker+0x1a0/0x1a0
[] ? kthread+0x116/0x130
[] ? kthread_flush_work_fn+0x10/0x10
[] ? ret_from_fork+0x35/0x40
When removing the TIPC module, the UDP tunnel sock will be delayed to
release in a work queue as sock_release() can't be done in rtnl_lock().
If the work queue is schedule to run after the TIPC module is removed,
kernel will crash as the work queue function cleanup_beareri() code no
longer exists when trying to invoke it.
To fix it, this patch introduce a member wq_count in tipc_net to track
the numbers of work queues in schedule, and wait and exit until all
work queues are done in tipc_exit_net().
Fixes: d0f91938bede ("tipc: add ip/udp media type")
Reported-by: Shuang Li <shuali(a)redhat.com>
Signed-off-by: Xin Long <lucien.xin(a)gmail.com>
Acked-by: Jon Maloy <jmaloy(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 5cc1f0307215..72f3ac73779b 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -119,6 +119,8 @@ static void __net_exit tipc_exit_net(struct net *net)
#ifdef CONFIG_TIPC_CRYPTO
tipc_crypto_stop(&tipc_net(net)->crypto_tx);
#endif
+ while (atomic_read(&tn->wq_count))
+ cond_resched();
}
static void __net_exit tipc_pernet_pre_exit(struct net *net)
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 03de7b213f55..5741ae488bb5 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -149,6 +149,8 @@ struct tipc_net {
#endif
/* Work item for net finalize */
struct tipc_net_work final_work;
+ /* The numbers of work queues in schedule */
+ atomic_t wq_count;
};
static inline struct tipc_net *tipc_net(struct net *net)
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index e556d2cdc064..c2bb818704c8 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -814,6 +814,7 @@ static void cleanup_bearer(struct work_struct *work)
kfree_rcu(rcast, rcu);
}
+ atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count);
dst_cache_destroy(&ub->rcast.dst_cache);
udp_tunnel_sock_release(ub->ubsock);
synchronize_net();
@@ -834,6 +835,7 @@ static void tipc_udp_disable(struct tipc_bearer *b)
RCU_INIT_POINTER(ub->bearer, NULL);
/* sock_release need to be done outside of rtnl lock */
+ atomic_inc(&tipc_net(sock_net(ub->ubsock->sk))->wq_count);
INIT_WORK(&ub->work, cleanup_bearer);
schedule_work(&ub->work);
}
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 04c26faa51d1e2fe71cf13c45791f5174c37f986 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin(a)gmail.com>
Date: Mon, 17 May 2021 02:28:58 +0800
Subject: [PATCH] tipc: wait and exit until all work queues are done
On some host, a crash could be triggered simply by repeating these
commands several times:
# modprobe tipc
# tipc bearer enable media udp name UDP1 localip 127.0.0.1
# rmmod tipc
[] BUG: unable to handle kernel paging request at ffffffffc096bb00
[] Workqueue: events 0xffffffffc096bb00
[] Call Trace:
[] ? process_one_work+0x1a7/0x360
[] ? worker_thread+0x30/0x390
[] ? create_worker+0x1a0/0x1a0
[] ? kthread+0x116/0x130
[] ? kthread_flush_work_fn+0x10/0x10
[] ? ret_from_fork+0x35/0x40
When removing the TIPC module, the UDP tunnel sock will be delayed to
release in a work queue as sock_release() can't be done in rtnl_lock().
If the work queue is schedule to run after the TIPC module is removed,
kernel will crash as the work queue function cleanup_beareri() code no
longer exists when trying to invoke it.
To fix it, this patch introduce a member wq_count in tipc_net to track
the numbers of work queues in schedule, and wait and exit until all
work queues are done in tipc_exit_net().
Fixes: d0f91938bede ("tipc: add ip/udp media type")
Reported-by: Shuang Li <shuali(a)redhat.com>
Signed-off-by: Xin Long <lucien.xin(a)gmail.com>
Acked-by: Jon Maloy <jmaloy(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 5cc1f0307215..72f3ac73779b 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -119,6 +119,8 @@ static void __net_exit tipc_exit_net(struct net *net)
#ifdef CONFIG_TIPC_CRYPTO
tipc_crypto_stop(&tipc_net(net)->crypto_tx);
#endif
+ while (atomic_read(&tn->wq_count))
+ cond_resched();
}
static void __net_exit tipc_pernet_pre_exit(struct net *net)
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 03de7b213f55..5741ae488bb5 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -149,6 +149,8 @@ struct tipc_net {
#endif
/* Work item for net finalize */
struct tipc_net_work final_work;
+ /* The numbers of work queues in schedule */
+ atomic_t wq_count;
};
static inline struct tipc_net *tipc_net(struct net *net)
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index e556d2cdc064..c2bb818704c8 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -814,6 +814,7 @@ static void cleanup_bearer(struct work_struct *work)
kfree_rcu(rcast, rcu);
}
+ atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count);
dst_cache_destroy(&ub->rcast.dst_cache);
udp_tunnel_sock_release(ub->ubsock);
synchronize_net();
@@ -834,6 +835,7 @@ static void tipc_udp_disable(struct tipc_bearer *b)
RCU_INIT_POINTER(ub->bearer, NULL);
/* sock_release need to be done outside of rtnl lock */
+ atomic_inc(&tipc_net(sock_net(ub->ubsock->sk))->wq_count);
INIT_WORK(&ub->work, cleanup_bearer);
schedule_work(&ub->work);
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From fe7738eb3ca3631a75844e790f6cb576c0fe7b00 Mon Sep 17 00:00:00 2001
From: Dima Chumak <dchumak(a)nvidia.com>
Date: Mon, 26 Apr 2021 15:16:26 +0300
Subject: [PATCH] net/mlx5e: Fix nullptr in mlx5e_tc_add_fdb_flow()
The result of __dev_get_by_index() is not checked for NULL, which then
passed to mlx5e_attach_encap() and gets dereferenced.
Also, in case of a successful lookup, the net_device reference count is
not incremented, which may result in net_device pointer becoming invalid
at any time during mlx5e_attach_encap() execution.
Fix by using dev_get_by_index(), which does proper reference counting on
the net_device pointer. Also, handle nullptr return value when mirred
device is not found.
It's safe to call dev_put() on the mirred net_device pointer, right
after mlx5e_attach_encap() call, because it's not being saved/copied
down the call chain.
Fixes: 3c37745ec614 ("net/mlx5e: Properly deal with encap flows add/del under neigh update")
Addresses-Coverity: ("Dereference null return value")
Signed-off-by: Dima Chumak <dchumak(a)nvidia.com>
Reviewed-by: Vlad Buslov <vladbu(a)nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm(a)nvidia.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 46945d04b5b8..882bafba43f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1322,10 +1322,10 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
struct netlink_ext_ack *extack)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
- struct net_device *out_dev, *encap_dev = NULL;
struct mlx5e_tc_flow_parse_attr *parse_attr;
struct mlx5_flow_attr *attr = flow->attr;
bool vf_tun = false, encap_valid = true;
+ struct net_device *encap_dev = NULL;
struct mlx5_esw_flow_attr *esw_attr;
struct mlx5_fc *counter = NULL;
struct mlx5e_rep_priv *rpriv;
@@ -1371,16 +1371,22 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
esw_attr = attr->esw_attr;
for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) {
+ struct net_device *out_dev;
int mirred_ifindex;
if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP))
continue;
mirred_ifindex = parse_attr->mirred_ifindex[out_index];
- out_dev = __dev_get_by_index(dev_net(priv->netdev),
- mirred_ifindex);
+ out_dev = dev_get_by_index(dev_net(priv->netdev), mirred_ifindex);
+ if (!out_dev) {
+ NL_SET_ERR_MSG_MOD(extack, "Requested mirred device not found");
+ err = -ENODEV;
+ goto err_out;
+ }
err = mlx5e_attach_encap(priv, flow, out_dev, out_index,
extack, &encap_dev, &encap_valid);
+ dev_put(out_dev);
if (err)
goto err_out;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e877a88d1f069edced4160792f42c2a8e2dba942 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb(a)suse.de>
Date: Mon, 17 May 2021 09:59:10 +1000
Subject: [PATCH] SUNRPC in case of backlog, hand free slots directly to
waiting task
If sunrpc.tcp_max_slot_table_entries is small and there are tasks
on the backlog queue, then when a request completes it is freed and the
first task on the queue is woken. The expectation is that it will wake
and claim that request. However if it was a sync task and the waiting
process was killed at just that moment, it will wake and NOT claim the
request.
As long as TASK_CONGESTED remains set, requests can only be claimed by
tasks woken from the backlog, and they are woken only as requests are
freed, so when a task doesn't claim a request, no other task can ever
get that request until TASK_CONGESTED is cleared. Each time this
happens the number of available requests is decreased by one.
With a sufficiently high workload and sufficiently low setting of
max_slot (16 in the case where this was seen), TASK_CONGESTED can remain
set for an extended period, and the above scenario (of a process being
killed just as its task was woken) can repeat until no requests can be
allocated. Then traffic stops.
This patch addresses the problem by introducing a positive handover of a
request from a completing task to a backlog task - the request is never
freed when there is a backlog.
When a task is woken it might not already have a request attached in
which case it is *not* freed (as with current code) but is initialised
(if needed) and used. If it isn't used it will eventually be freed by
rpc_exit_task(). xprt_release() is enhanced to be able to correctly
release an uninitialised request.
Fixes: ba60eb25ff6b ("SUNRPC: Fix a livelock problem in the xprt->backlog queue")
Signed-off-by: NeilBrown <neilb(a)suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust(a)hammerspace.com>
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f555d335e910..42623d6b8f0e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1677,13 +1677,6 @@ call_reserveresult(struct rpc_task *task)
return;
}
- /*
- * Even though there was an error, we may have acquired
- * a request slot somehow. Make sure not to leak it.
- */
- if (task->tk_rqstp)
- xprt_release(task);
-
switch (status) {
case -ENOMEM:
rpc_delay(task, HZ >> 2);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index e5b5a960a69b..5b3981fd3783 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -70,6 +70,7 @@
static void xprt_init(struct rpc_xprt *xprt, struct net *net);
static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
static void xprt_destroy(struct rpc_xprt *xprt);
+static void xprt_request_init(struct rpc_task *task);
static DEFINE_SPINLOCK(xprt_list_lock);
static LIST_HEAD(xprt_list);
@@ -1612,10 +1613,26 @@ static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
rpc_sleep_on(&xprt->backlog, task, NULL);
}
-static void xprt_wake_up_backlog(struct rpc_xprt *xprt)
+static bool __xprt_set_rq(struct rpc_task *task, void *data)
{
- if (rpc_wake_up_next(&xprt->backlog) == NULL)
+ struct rpc_rqst *req = data;
+
+ if (task->tk_rqstp == NULL) {
+ memset(req, 0, sizeof(*req)); /* mark unused */
+ task->tk_status = -EAGAIN;
+ task->tk_rqstp = req;
+ return true;
+ }
+ return false;
+}
+
+static bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+ if (rpc_wake_up_first(&xprt->backlog, __xprt_set_rq, req) == NULL) {
clear_bit(XPRT_CONGESTED, &xprt->state);
+ return false;
+ }
+ return true;
}
static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task)
@@ -1703,11 +1720,11 @@ EXPORT_SYMBOL_GPL(xprt_alloc_slot);
void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
spin_lock(&xprt->reserve_lock);
- if (!xprt_dynamic_free_slot(xprt, req)) {
+ if (!xprt_wake_up_backlog(xprt, req) &&
+ !xprt_dynamic_free_slot(xprt, req)) {
memset(req, 0, sizeof(*req)); /* mark unused */
list_add(&req->rq_list, &xprt->free);
}
- xprt_wake_up_backlog(xprt);
spin_unlock(&xprt->reserve_lock);
}
EXPORT_SYMBOL_GPL(xprt_free_slot);
@@ -1795,6 +1812,10 @@ xprt_request_init(struct rpc_task *task)
struct rpc_xprt *xprt = task->tk_xprt;
struct rpc_rqst *req = task->tk_rqstp;
+ if (req->rq_task)
+ /* Already initialized */
+ return;
+
req->rq_task = task;
req->rq_xprt = xprt;
req->rq_buffer = NULL;
@@ -1855,8 +1876,10 @@ void xprt_retry_reserve(struct rpc_task *task)
struct rpc_xprt *xprt = task->tk_xprt;
task->tk_status = 0;
- if (task->tk_rqstp != NULL)
+ if (task->tk_rqstp != NULL) {
+ xprt_request_init(task);
return;
+ }
task->tk_status = -EAGAIN;
xprt_do_reserve(xprt, task);
@@ -1881,23 +1904,26 @@ void xprt_release(struct rpc_task *task)
}
xprt = req->rq_xprt;
- xprt_request_dequeue_xprt(task);
- spin_lock(&xprt->transport_lock);
- xprt->ops->release_xprt(xprt, task);
- if (xprt->ops->release_request)
- xprt->ops->release_request(task);
- xprt_schedule_autodisconnect(xprt);
- spin_unlock(&xprt->transport_lock);
- if (req->rq_buffer)
- xprt->ops->buf_free(task);
- xdr_free_bvec(&req->rq_rcv_buf);
- xdr_free_bvec(&req->rq_snd_buf);
- if (req->rq_cred != NULL)
- put_rpccred(req->rq_cred);
- task->tk_rqstp = NULL;
- if (req->rq_release_snd_buf)
- req->rq_release_snd_buf(req);
+ if (xprt) {
+ xprt_request_dequeue_xprt(task);
+ spin_lock(&xprt->transport_lock);
+ xprt->ops->release_xprt(xprt, task);
+ if (xprt->ops->release_request)
+ xprt->ops->release_request(task);
+ xprt_schedule_autodisconnect(xprt);
+ spin_unlock(&xprt->transport_lock);
+ if (req->rq_buffer)
+ xprt->ops->buf_free(task);
+ xdr_free_bvec(&req->rq_rcv_buf);
+ xdr_free_bvec(&req->rq_snd_buf);
+ if (req->rq_cred != NULL)
+ put_rpccred(req->rq_cred);
+ if (req->rq_release_snd_buf)
+ req->rq_release_snd_buf(req);
+ } else
+ xprt = task->tk_xprt;
+ task->tk_rqstp = NULL;
if (likely(!bc_prealloc(req)))
xprt->ops->free_slot(xprt, req);
else
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e877a88d1f069edced4160792f42c2a8e2dba942 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb(a)suse.de>
Date: Mon, 17 May 2021 09:59:10 +1000
Subject: [PATCH] SUNRPC in case of backlog, hand free slots directly to
waiting task
If sunrpc.tcp_max_slot_table_entries is small and there are tasks
on the backlog queue, then when a request completes it is freed and the
first task on the queue is woken. The expectation is that it will wake
and claim that request. However if it was a sync task and the waiting
process was killed at just that moment, it will wake and NOT claim the
request.
As long as TASK_CONGESTED remains set, requests can only be claimed by
tasks woken from the backlog, and they are woken only as requests are
freed, so when a task doesn't claim a request, no other task can ever
get that request until TASK_CONGESTED is cleared. Each time this
happens the number of available requests is decreased by one.
With a sufficiently high workload and sufficiently low setting of
max_slot (16 in the case where this was seen), TASK_CONGESTED can remain
set for an extended period, and the above scenario (of a process being
killed just as its task was woken) can repeat until no requests can be
allocated. Then traffic stops.
This patch addresses the problem by introducing a positive handover of a
request from a completing task to a backlog task - the request is never
freed when there is a backlog.
When a task is woken it might not already have a request attached in
which case it is *not* freed (as with current code) but is initialised
(if needed) and used. If it isn't used it will eventually be freed by
rpc_exit_task(). xprt_release() is enhanced to be able to correctly
release an uninitialised request.
Fixes: ba60eb25ff6b ("SUNRPC: Fix a livelock problem in the xprt->backlog queue")
Signed-off-by: NeilBrown <neilb(a)suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust(a)hammerspace.com>
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f555d335e910..42623d6b8f0e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1677,13 +1677,6 @@ call_reserveresult(struct rpc_task *task)
return;
}
- /*
- * Even though there was an error, we may have acquired
- * a request slot somehow. Make sure not to leak it.
- */
- if (task->tk_rqstp)
- xprt_release(task);
-
switch (status) {
case -ENOMEM:
rpc_delay(task, HZ >> 2);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index e5b5a960a69b..5b3981fd3783 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -70,6 +70,7 @@
static void xprt_init(struct rpc_xprt *xprt, struct net *net);
static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
static void xprt_destroy(struct rpc_xprt *xprt);
+static void xprt_request_init(struct rpc_task *task);
static DEFINE_SPINLOCK(xprt_list_lock);
static LIST_HEAD(xprt_list);
@@ -1612,10 +1613,26 @@ static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
rpc_sleep_on(&xprt->backlog, task, NULL);
}
-static void xprt_wake_up_backlog(struct rpc_xprt *xprt)
+static bool __xprt_set_rq(struct rpc_task *task, void *data)
{
- if (rpc_wake_up_next(&xprt->backlog) == NULL)
+ struct rpc_rqst *req = data;
+
+ if (task->tk_rqstp == NULL) {
+ memset(req, 0, sizeof(*req)); /* mark unused */
+ task->tk_status = -EAGAIN;
+ task->tk_rqstp = req;
+ return true;
+ }
+ return false;
+}
+
+static bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+ if (rpc_wake_up_first(&xprt->backlog, __xprt_set_rq, req) == NULL) {
clear_bit(XPRT_CONGESTED, &xprt->state);
+ return false;
+ }
+ return true;
}
static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task)
@@ -1703,11 +1720,11 @@ EXPORT_SYMBOL_GPL(xprt_alloc_slot);
void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
spin_lock(&xprt->reserve_lock);
- if (!xprt_dynamic_free_slot(xprt, req)) {
+ if (!xprt_wake_up_backlog(xprt, req) &&
+ !xprt_dynamic_free_slot(xprt, req)) {
memset(req, 0, sizeof(*req)); /* mark unused */
list_add(&req->rq_list, &xprt->free);
}
- xprt_wake_up_backlog(xprt);
spin_unlock(&xprt->reserve_lock);
}
EXPORT_SYMBOL_GPL(xprt_free_slot);
@@ -1795,6 +1812,10 @@ xprt_request_init(struct rpc_task *task)
struct rpc_xprt *xprt = task->tk_xprt;
struct rpc_rqst *req = task->tk_rqstp;
+ if (req->rq_task)
+ /* Already initialized */
+ return;
+
req->rq_task = task;
req->rq_xprt = xprt;
req->rq_buffer = NULL;
@@ -1855,8 +1876,10 @@ void xprt_retry_reserve(struct rpc_task *task)
struct rpc_xprt *xprt = task->tk_xprt;
task->tk_status = 0;
- if (task->tk_rqstp != NULL)
+ if (task->tk_rqstp != NULL) {
+ xprt_request_init(task);
return;
+ }
task->tk_status = -EAGAIN;
xprt_do_reserve(xprt, task);
@@ -1881,23 +1904,26 @@ void xprt_release(struct rpc_task *task)
}
xprt = req->rq_xprt;
- xprt_request_dequeue_xprt(task);
- spin_lock(&xprt->transport_lock);
- xprt->ops->release_xprt(xprt, task);
- if (xprt->ops->release_request)
- xprt->ops->release_request(task);
- xprt_schedule_autodisconnect(xprt);
- spin_unlock(&xprt->transport_lock);
- if (req->rq_buffer)
- xprt->ops->buf_free(task);
- xdr_free_bvec(&req->rq_rcv_buf);
- xdr_free_bvec(&req->rq_snd_buf);
- if (req->rq_cred != NULL)
- put_rpccred(req->rq_cred);
- task->tk_rqstp = NULL;
- if (req->rq_release_snd_buf)
- req->rq_release_snd_buf(req);
+ if (xprt) {
+ xprt_request_dequeue_xprt(task);
+ spin_lock(&xprt->transport_lock);
+ xprt->ops->release_xprt(xprt, task);
+ if (xprt->ops->release_request)
+ xprt->ops->release_request(task);
+ xprt_schedule_autodisconnect(xprt);
+ spin_unlock(&xprt->transport_lock);
+ if (req->rq_buffer)
+ xprt->ops->buf_free(task);
+ xdr_free_bvec(&req->rq_rcv_buf);
+ xdr_free_bvec(&req->rq_snd_buf);
+ if (req->rq_cred != NULL)
+ put_rpccred(req->rq_cred);
+ if (req->rq_release_snd_buf)
+ req->rq_release_snd_buf(req);
+ } else
+ xprt = task->tk_xprt;
+ task->tk_rqstp = NULL;
if (likely(!bc_prealloc(req)))
xprt->ops->free_slot(xprt, req);
else
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e877a88d1f069edced4160792f42c2a8e2dba942 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb(a)suse.de>
Date: Mon, 17 May 2021 09:59:10 +1000
Subject: [PATCH] SUNRPC in case of backlog, hand free slots directly to
waiting task
If sunrpc.tcp_max_slot_table_entries is small and there are tasks
on the backlog queue, then when a request completes it is freed and the
first task on the queue is woken. The expectation is that it will wake
and claim that request. However if it was a sync task and the waiting
process was killed at just that moment, it will wake and NOT claim the
request.
As long as TASK_CONGESTED remains set, requests can only be claimed by
tasks woken from the backlog, and they are woken only as requests are
freed, so when a task doesn't claim a request, no other task can ever
get that request until TASK_CONGESTED is cleared. Each time this
happens the number of available requests is decreased by one.
With a sufficiently high workload and sufficiently low setting of
max_slot (16 in the case where this was seen), TASK_CONGESTED can remain
set for an extended period, and the above scenario (of a process being
killed just as its task was woken) can repeat until no requests can be
allocated. Then traffic stops.
This patch addresses the problem by introducing a positive handover of a
request from a completing task to a backlog task - the request is never
freed when there is a backlog.
When a task is woken it might not already have a request attached in
which case it is *not* freed (as with current code) but is initialised
(if needed) and used. If it isn't used it will eventually be freed by
rpc_exit_task(). xprt_release() is enhanced to be able to correctly
release an uninitialised request.
Fixes: ba60eb25ff6b ("SUNRPC: Fix a livelock problem in the xprt->backlog queue")
Signed-off-by: NeilBrown <neilb(a)suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust(a)hammerspace.com>
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f555d335e910..42623d6b8f0e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1677,13 +1677,6 @@ call_reserveresult(struct rpc_task *task)
return;
}
- /*
- * Even though there was an error, we may have acquired
- * a request slot somehow. Make sure not to leak it.
- */
- if (task->tk_rqstp)
- xprt_release(task);
-
switch (status) {
case -ENOMEM:
rpc_delay(task, HZ >> 2);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index e5b5a960a69b..5b3981fd3783 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -70,6 +70,7 @@
static void xprt_init(struct rpc_xprt *xprt, struct net *net);
static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
static void xprt_destroy(struct rpc_xprt *xprt);
+static void xprt_request_init(struct rpc_task *task);
static DEFINE_SPINLOCK(xprt_list_lock);
static LIST_HEAD(xprt_list);
@@ -1612,10 +1613,26 @@ static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
rpc_sleep_on(&xprt->backlog, task, NULL);
}
-static void xprt_wake_up_backlog(struct rpc_xprt *xprt)
+static bool __xprt_set_rq(struct rpc_task *task, void *data)
{
- if (rpc_wake_up_next(&xprt->backlog) == NULL)
+ struct rpc_rqst *req = data;
+
+ if (task->tk_rqstp == NULL) {
+ memset(req, 0, sizeof(*req)); /* mark unused */
+ task->tk_status = -EAGAIN;
+ task->tk_rqstp = req;
+ return true;
+ }
+ return false;
+}
+
+static bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+ if (rpc_wake_up_first(&xprt->backlog, __xprt_set_rq, req) == NULL) {
clear_bit(XPRT_CONGESTED, &xprt->state);
+ return false;
+ }
+ return true;
}
static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task)
@@ -1703,11 +1720,11 @@ EXPORT_SYMBOL_GPL(xprt_alloc_slot);
void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
spin_lock(&xprt->reserve_lock);
- if (!xprt_dynamic_free_slot(xprt, req)) {
+ if (!xprt_wake_up_backlog(xprt, req) &&
+ !xprt_dynamic_free_slot(xprt, req)) {
memset(req, 0, sizeof(*req)); /* mark unused */
list_add(&req->rq_list, &xprt->free);
}
- xprt_wake_up_backlog(xprt);
spin_unlock(&xprt->reserve_lock);
}
EXPORT_SYMBOL_GPL(xprt_free_slot);
@@ -1795,6 +1812,10 @@ xprt_request_init(struct rpc_task *task)
struct rpc_xprt *xprt = task->tk_xprt;
struct rpc_rqst *req = task->tk_rqstp;
+ if (req->rq_task)
+ /* Already initialized */
+ return;
+
req->rq_task = task;
req->rq_xprt = xprt;
req->rq_buffer = NULL;
@@ -1855,8 +1876,10 @@ void xprt_retry_reserve(struct rpc_task *task)
struct rpc_xprt *xprt = task->tk_xprt;
task->tk_status = 0;
- if (task->tk_rqstp != NULL)
+ if (task->tk_rqstp != NULL) {
+ xprt_request_init(task);
return;
+ }
task->tk_status = -EAGAIN;
xprt_do_reserve(xprt, task);
@@ -1881,23 +1904,26 @@ void xprt_release(struct rpc_task *task)
}
xprt = req->rq_xprt;
- xprt_request_dequeue_xprt(task);
- spin_lock(&xprt->transport_lock);
- xprt->ops->release_xprt(xprt, task);
- if (xprt->ops->release_request)
- xprt->ops->release_request(task);
- xprt_schedule_autodisconnect(xprt);
- spin_unlock(&xprt->transport_lock);
- if (req->rq_buffer)
- xprt->ops->buf_free(task);
- xdr_free_bvec(&req->rq_rcv_buf);
- xdr_free_bvec(&req->rq_snd_buf);
- if (req->rq_cred != NULL)
- put_rpccred(req->rq_cred);
- task->tk_rqstp = NULL;
- if (req->rq_release_snd_buf)
- req->rq_release_snd_buf(req);
+ if (xprt) {
+ xprt_request_dequeue_xprt(task);
+ spin_lock(&xprt->transport_lock);
+ xprt->ops->release_xprt(xprt, task);
+ if (xprt->ops->release_request)
+ xprt->ops->release_request(task);
+ xprt_schedule_autodisconnect(xprt);
+ spin_unlock(&xprt->transport_lock);
+ if (req->rq_buffer)
+ xprt->ops->buf_free(task);
+ xdr_free_bvec(&req->rq_rcv_buf);
+ xdr_free_bvec(&req->rq_snd_buf);
+ if (req->rq_cred != NULL)
+ put_rpccred(req->rq_cred);
+ if (req->rq_release_snd_buf)
+ req->rq_release_snd_buf(req);
+ } else
+ xprt = task->tk_xprt;
+ task->tk_rqstp = NULL;
if (likely(!bc_prealloc(req)))
xprt->ops->free_slot(xprt, req);
else
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e877a88d1f069edced4160792f42c2a8e2dba942 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb(a)suse.de>
Date: Mon, 17 May 2021 09:59:10 +1000
Subject: [PATCH] SUNRPC in case of backlog, hand free slots directly to
waiting task
If sunrpc.tcp_max_slot_table_entries is small and there are tasks
on the backlog queue, then when a request completes it is freed and the
first task on the queue is woken. The expectation is that it will wake
and claim that request. However if it was a sync task and the waiting
process was killed at just that moment, it will wake and NOT claim the
request.
As long as TASK_CONGESTED remains set, requests can only be claimed by
tasks woken from the backlog, and they are woken only as requests are
freed, so when a task doesn't claim a request, no other task can ever
get that request until TASK_CONGESTED is cleared. Each time this
happens the number of available requests is decreased by one.
With a sufficiently high workload and sufficiently low setting of
max_slot (16 in the case where this was seen), TASK_CONGESTED can remain
set for an extended period, and the above scenario (of a process being
killed just as its task was woken) can repeat until no requests can be
allocated. Then traffic stops.
This patch addresses the problem by introducing a positive handover of a
request from a completing task to a backlog task - the request is never
freed when there is a backlog.
When a task is woken it might not already have a request attached in
which case it is *not* freed (as with current code) but is initialised
(if needed) and used. If it isn't used it will eventually be freed by
rpc_exit_task(). xprt_release() is enhanced to be able to correctly
release an uninitialised request.
Fixes: ba60eb25ff6b ("SUNRPC: Fix a livelock problem in the xprt->backlog queue")
Signed-off-by: NeilBrown <neilb(a)suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust(a)hammerspace.com>
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f555d335e910..42623d6b8f0e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1677,13 +1677,6 @@ call_reserveresult(struct rpc_task *task)
return;
}
- /*
- * Even though there was an error, we may have acquired
- * a request slot somehow. Make sure not to leak it.
- */
- if (task->tk_rqstp)
- xprt_release(task);
-
switch (status) {
case -ENOMEM:
rpc_delay(task, HZ >> 2);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index e5b5a960a69b..5b3981fd3783 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -70,6 +70,7 @@
static void xprt_init(struct rpc_xprt *xprt, struct net *net);
static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
static void xprt_destroy(struct rpc_xprt *xprt);
+static void xprt_request_init(struct rpc_task *task);
static DEFINE_SPINLOCK(xprt_list_lock);
static LIST_HEAD(xprt_list);
@@ -1612,10 +1613,26 @@ static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
rpc_sleep_on(&xprt->backlog, task, NULL);
}
-static void xprt_wake_up_backlog(struct rpc_xprt *xprt)
+static bool __xprt_set_rq(struct rpc_task *task, void *data)
{
- if (rpc_wake_up_next(&xprt->backlog) == NULL)
+ struct rpc_rqst *req = data;
+
+ if (task->tk_rqstp == NULL) {
+ memset(req, 0, sizeof(*req)); /* mark unused */
+ task->tk_status = -EAGAIN;
+ task->tk_rqstp = req;
+ return true;
+ }
+ return false;
+}
+
+static bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+ if (rpc_wake_up_first(&xprt->backlog, __xprt_set_rq, req) == NULL) {
clear_bit(XPRT_CONGESTED, &xprt->state);
+ return false;
+ }
+ return true;
}
static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task)
@@ -1703,11 +1720,11 @@ EXPORT_SYMBOL_GPL(xprt_alloc_slot);
void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
spin_lock(&xprt->reserve_lock);
- if (!xprt_dynamic_free_slot(xprt, req)) {
+ if (!xprt_wake_up_backlog(xprt, req) &&
+ !xprt_dynamic_free_slot(xprt, req)) {
memset(req, 0, sizeof(*req)); /* mark unused */
list_add(&req->rq_list, &xprt->free);
}
- xprt_wake_up_backlog(xprt);
spin_unlock(&xprt->reserve_lock);
}
EXPORT_SYMBOL_GPL(xprt_free_slot);
@@ -1795,6 +1812,10 @@ xprt_request_init(struct rpc_task *task)
struct rpc_xprt *xprt = task->tk_xprt;
struct rpc_rqst *req = task->tk_rqstp;
+ if (req->rq_task)
+ /* Already initialized */
+ return;
+
req->rq_task = task;
req->rq_xprt = xprt;
req->rq_buffer = NULL;
@@ -1855,8 +1876,10 @@ void xprt_retry_reserve(struct rpc_task *task)
struct rpc_xprt *xprt = task->tk_xprt;
task->tk_status = 0;
- if (task->tk_rqstp != NULL)
+ if (task->tk_rqstp != NULL) {
+ xprt_request_init(task);
return;
+ }
task->tk_status = -EAGAIN;
xprt_do_reserve(xprt, task);
@@ -1881,23 +1904,26 @@ void xprt_release(struct rpc_task *task)
}
xprt = req->rq_xprt;
- xprt_request_dequeue_xprt(task);
- spin_lock(&xprt->transport_lock);
- xprt->ops->release_xprt(xprt, task);
- if (xprt->ops->release_request)
- xprt->ops->release_request(task);
- xprt_schedule_autodisconnect(xprt);
- spin_unlock(&xprt->transport_lock);
- if (req->rq_buffer)
- xprt->ops->buf_free(task);
- xdr_free_bvec(&req->rq_rcv_buf);
- xdr_free_bvec(&req->rq_snd_buf);
- if (req->rq_cred != NULL)
- put_rpccred(req->rq_cred);
- task->tk_rqstp = NULL;
- if (req->rq_release_snd_buf)
- req->rq_release_snd_buf(req);
+ if (xprt) {
+ xprt_request_dequeue_xprt(task);
+ spin_lock(&xprt->transport_lock);
+ xprt->ops->release_xprt(xprt, task);
+ if (xprt->ops->release_request)
+ xprt->ops->release_request(task);
+ xprt_schedule_autodisconnect(xprt);
+ spin_unlock(&xprt->transport_lock);
+ if (req->rq_buffer)
+ xprt->ops->buf_free(task);
+ xdr_free_bvec(&req->rq_rcv_buf);
+ xdr_free_bvec(&req->rq_snd_buf);
+ if (req->rq_cred != NULL)
+ put_rpccred(req->rq_cred);
+ if (req->rq_release_snd_buf)
+ req->rq_release_snd_buf(req);
+ } else
+ xprt = task->tk_xprt;
+ task->tk_rqstp = NULL;
if (likely(!bc_prealloc(req)))
xprt->ops->free_slot(xprt, req);
else
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e877a88d1f069edced4160792f42c2a8e2dba942 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb(a)suse.de>
Date: Mon, 17 May 2021 09:59:10 +1000
Subject: [PATCH] SUNRPC in case of backlog, hand free slots directly to
waiting task
If sunrpc.tcp_max_slot_table_entries is small and there are tasks
on the backlog queue, then when a request completes it is freed and the
first task on the queue is woken. The expectation is that it will wake
and claim that request. However if it was a sync task and the waiting
process was killed at just that moment, it will wake and NOT claim the
request.
As long as TASK_CONGESTED remains set, requests can only be claimed by
tasks woken from the backlog, and they are woken only as requests are
freed, so when a task doesn't claim a request, no other task can ever
get that request until TASK_CONGESTED is cleared. Each time this
happens the number of available requests is decreased by one.
With a sufficiently high workload and sufficiently low setting of
max_slot (16 in the case where this was seen), TASK_CONGESTED can remain
set for an extended period, and the above scenario (of a process being
killed just as its task was woken) can repeat until no requests can be
allocated. Then traffic stops.
This patch addresses the problem by introducing a positive handover of a
request from a completing task to a backlog task - the request is never
freed when there is a backlog.
When a task is woken it might not already have a request attached in
which case it is *not* freed (as with current code) but is initialised
(if needed) and used. If it isn't used it will eventually be freed by
rpc_exit_task(). xprt_release() is enhanced to be able to correctly
release an uninitialised request.
Fixes: ba60eb25ff6b ("SUNRPC: Fix a livelock problem in the xprt->backlog queue")
Signed-off-by: NeilBrown <neilb(a)suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust(a)hammerspace.com>
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f555d335e910..42623d6b8f0e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1677,13 +1677,6 @@ call_reserveresult(struct rpc_task *task)
return;
}
- /*
- * Even though there was an error, we may have acquired
- * a request slot somehow. Make sure not to leak it.
- */
- if (task->tk_rqstp)
- xprt_release(task);
-
switch (status) {
case -ENOMEM:
rpc_delay(task, HZ >> 2);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index e5b5a960a69b..5b3981fd3783 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -70,6 +70,7 @@
static void xprt_init(struct rpc_xprt *xprt, struct net *net);
static __be32 xprt_alloc_xid(struct rpc_xprt *xprt);
static void xprt_destroy(struct rpc_xprt *xprt);
+static void xprt_request_init(struct rpc_task *task);
static DEFINE_SPINLOCK(xprt_list_lock);
static LIST_HEAD(xprt_list);
@@ -1612,10 +1613,26 @@ static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
rpc_sleep_on(&xprt->backlog, task, NULL);
}
-static void xprt_wake_up_backlog(struct rpc_xprt *xprt)
+static bool __xprt_set_rq(struct rpc_task *task, void *data)
{
- if (rpc_wake_up_next(&xprt->backlog) == NULL)
+ struct rpc_rqst *req = data;
+
+ if (task->tk_rqstp == NULL) {
+ memset(req, 0, sizeof(*req)); /* mark unused */
+ task->tk_status = -EAGAIN;
+ task->tk_rqstp = req;
+ return true;
+ }
+ return false;
+}
+
+static bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+ if (rpc_wake_up_first(&xprt->backlog, __xprt_set_rq, req) == NULL) {
clear_bit(XPRT_CONGESTED, &xprt->state);
+ return false;
+ }
+ return true;
}
static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task)
@@ -1703,11 +1720,11 @@ EXPORT_SYMBOL_GPL(xprt_alloc_slot);
void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
spin_lock(&xprt->reserve_lock);
- if (!xprt_dynamic_free_slot(xprt, req)) {
+ if (!xprt_wake_up_backlog(xprt, req) &&
+ !xprt_dynamic_free_slot(xprt, req)) {
memset(req, 0, sizeof(*req)); /* mark unused */
list_add(&req->rq_list, &xprt->free);
}
- xprt_wake_up_backlog(xprt);
spin_unlock(&xprt->reserve_lock);
}
EXPORT_SYMBOL_GPL(xprt_free_slot);
@@ -1795,6 +1812,10 @@ xprt_request_init(struct rpc_task *task)
struct rpc_xprt *xprt = task->tk_xprt;
struct rpc_rqst *req = task->tk_rqstp;
+ if (req->rq_task)
+ /* Already initialized */
+ return;
+
req->rq_task = task;
req->rq_xprt = xprt;
req->rq_buffer = NULL;
@@ -1855,8 +1876,10 @@ void xprt_retry_reserve(struct rpc_task *task)
struct rpc_xprt *xprt = task->tk_xprt;
task->tk_status = 0;
- if (task->tk_rqstp != NULL)
+ if (task->tk_rqstp != NULL) {
+ xprt_request_init(task);
return;
+ }
task->tk_status = -EAGAIN;
xprt_do_reserve(xprt, task);
@@ -1881,23 +1904,26 @@ void xprt_release(struct rpc_task *task)
}
xprt = req->rq_xprt;
- xprt_request_dequeue_xprt(task);
- spin_lock(&xprt->transport_lock);
- xprt->ops->release_xprt(xprt, task);
- if (xprt->ops->release_request)
- xprt->ops->release_request(task);
- xprt_schedule_autodisconnect(xprt);
- spin_unlock(&xprt->transport_lock);
- if (req->rq_buffer)
- xprt->ops->buf_free(task);
- xdr_free_bvec(&req->rq_rcv_buf);
- xdr_free_bvec(&req->rq_snd_buf);
- if (req->rq_cred != NULL)
- put_rpccred(req->rq_cred);
- task->tk_rqstp = NULL;
- if (req->rq_release_snd_buf)
- req->rq_release_snd_buf(req);
+ if (xprt) {
+ xprt_request_dequeue_xprt(task);
+ spin_lock(&xprt->transport_lock);
+ xprt->ops->release_xprt(xprt, task);
+ if (xprt->ops->release_request)
+ xprt->ops->release_request(task);
+ xprt_schedule_autodisconnect(xprt);
+ spin_unlock(&xprt->transport_lock);
+ if (req->rq_buffer)
+ xprt->ops->buf_free(task);
+ xdr_free_bvec(&req->rq_rcv_buf);
+ xdr_free_bvec(&req->rq_snd_buf);
+ if (req->rq_cred != NULL)
+ put_rpccred(req->rq_cred);
+ if (req->rq_release_snd_buf)
+ req->rq_release_snd_buf(req);
+ } else
+ xprt = task->tk_xprt;
+ task->tk_rqstp = NULL;
if (likely(!bc_prealloc(req)))
xprt->ops->free_slot(xprt, req);
else
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ef4c9f4f654622fa15b7a94a9bd1f19e76bb7feb Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack(a)google.com>
Date: Fri, 21 May 2021 17:38:28 +0000
Subject: [PATCH] KVM: selftests: Fix 32-bit truncation of vm_get_max_gfn()
vm_get_max_gfn() casts vm->max_gfn from a uint64_t to an unsigned int,
which causes the upper 32-bits of the max_gfn to get truncated.
Nobody noticed until now likely because vm_get_max_gfn() is only used
as a mechanism to create a memslot in an unused region of the guest
physical address space (the top), and the top of the 32-bit physical
address space was always good enough.
This fix reveals a bug in memslot_modification_stress_test which was
trying to create a dummy memslot past the end of guest physical memory.
Fix that by moving the dummy memslot lower.
Fixes: 52200d0d944e ("KVM: selftests: Remove duplicate guest mode handling")
Reviewed-by: Venkatesh Srinivas <venkateshs(a)chromium.org>
Signed-off-by: David Matlack <dmatlack(a)google.com>
Message-Id: <20210521173828.1180619-1-dmatlack(a)google.com>
Reviewed-by: Andrew Jones <drjones(a)redhat.com>
Reviewed-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index a8f022794ce3..2e0d253dabd6 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -302,7 +302,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm);
unsigned int vm_get_page_size(struct kvm_vm *vm);
unsigned int vm_get_page_shift(struct kvm_vm *vm);
-unsigned int vm_get_max_gfn(struct kvm_vm *vm);
+uint64_t vm_get_max_gfn(struct kvm_vm *vm);
int vm_get_fd(struct kvm_vm *vm);
unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 1255744758e3..ea3f0db85b3e 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2117,7 +2117,7 @@ unsigned int vm_get_page_shift(struct kvm_vm *vm)
return vm->page_shift;
}
-unsigned int vm_get_max_gfn(struct kvm_vm *vm)
+uint64_t vm_get_max_gfn(struct kvm_vm *vm)
{
return vm->max_gfn;
}
diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
index 81490b9b4e32..abf381800a59 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2020, Google LLC.
*/
+#include <inttypes.h>
#include "kvm_util.h"
#include "perf_test_util.h"
@@ -80,7 +81,8 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
*/
TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
"Requested more guest memory than address space allows.\n"
- " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n",
+ " guest pages: %" PRIx64 " max gfn: %" PRIx64
+ " vcpus: %d wss: %" PRIx64 "]\n",
guest_num_pages, vm_get_max_gfn(vm), vcpus,
vcpu_memory_bytes);
diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
index 6096bf0a5b34..98351ba0933c 100644
--- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c
+++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
@@ -71,14 +71,22 @@ struct memslot_antagonist_args {
};
static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay,
- uint64_t nr_modifications, uint64_t gpa)
+ uint64_t nr_modifications)
{
+ const uint64_t pages = 1;
+ uint64_t gpa;
int i;
+ /*
+ * Add the dummy memslot just below the perf_test_util memslot, which is
+ * at the top of the guest physical address space.
+ */
+ gpa = guest_test_phys_mem - pages * vm_get_page_size(vm);
+
for (i = 0; i < nr_modifications; i++) {
usleep(delay);
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa,
- DUMMY_MEMSLOT_INDEX, 1, 0);
+ DUMMY_MEMSLOT_INDEX, pages, 0);
vm_mem_region_delete(vm, DUMMY_MEMSLOT_INDEX);
}
@@ -120,11 +128,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("Started all vCPUs\n");
add_remove_memslot(vm, p->memslot_modification_delay,
- p->nr_memslot_modifications,
- guest_test_phys_mem +
- (guest_percpu_mem_size * nr_vcpus) +
- perf_test_args.host_page_size +
- perf_test_args.guest_page_size);
+ p->nr_memslot_modifications);
run_vcpus = false;