From: "Phil Auld" <pauld(a)redhat.com>
Subject: [PATCH] sched/fair: Avoid throttle_list starvation with low cfs quota
With a very low cpu.cfs_quota_us setting, such as the minimum of 1000,
distribute_cfs_runtime may not empty the throttled_list before it runs
out of runtime to distribute. In that case, due to the change in commit
c06f04c70489 to put throttled entries at the head of the list, later
entries on the list will starve. Essentially, the same small set of
processes gets pulled off the list and given CPU time, and then, once
throttled again, gets put back at the head of the list, where
distribute_cfs_runtime hands runtime to that same set of processes and
leaves the rest waiting.
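To illustrate the mechanics, here is a toy userspace model of the
distribution loop (illustrative only; the entry count, per-entry deficits
and per-period budget are made-up numbers, not kernel values). Because the
budget runs out before the list does, and served entries go back on the
head in the same order, every period serves the same leading entries:

#include <stdio.h>

#define NENT   8
#define BUDGET 1000	/* runtime handed out per period, in us */

int main(void)
{
	int deficit[NENT], period, i;

	for (i = 0; i < NENT; i++)
		deficit[i] = 600;	/* each entry wants 600us */

	for (period = 0; period < 3; period++) {
		int budget = BUDGET, served = 0;

		/* hand out runtime from the head until the budget is gone */
		while (budget > 0 && served < NENT) {
			int give = deficit[served] < budget ?
				   deficit[served] : budget;

			budget -= give;
			served++;
		}
		/*
		 * Served entries are re-throttled and re-inserted at the
		 * head, so the list order never changes.
		 */
		printf("period %d: entries 0..%d served, %d..%d starved\n",
		       period, served - 1, served, NENT - 1);
	}
	return 0;
}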
Fix the issue by setting a bit in struct cfs_bandwidth when
distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can
decide whether to put the throttled entry at the head or the tail of the
list. The bit is set and cleared by the callers of distribute_cfs_runtime
while they hold cfs_bandwidth->lock.
Signed-off-by: Phil Auld <pauld(a)redhat.com>
Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
This is easy to reproduce with a handful of cpu consumers (a minimal
reproducer sketch follows the sched_info output below). I use crash on
the live system. In some cases you can simply look at the throttled list
and see that the later entries are not changing:
crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1" "$4}' | pr -t -n3
1 ffff90b56cb2d200 -976050
2 ffff90b56cb2cc00 -484925
3 ffff90b56cb2bc00 -658814
4 ffff90b56cb2ba00 -275365
5 ffff90b166a45600 -135138
6 ffff90b56cb2da00 -282505
7 ffff90b56cb2e000 -148065
8 ffff90b56cb2fa00 -872591
9 ffff90b56cb2c000 -84687
10 ffff90b56cb2f000 -87237
11 ffff90b166a40a00 -164582
crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1" "$4}' | pr -t -n3
1 ffff90b56cb2d200 -994147
2 ffff90b56cb2cc00 -306051
3 ffff90b56cb2bc00 -961321
4 ffff90b56cb2ba00 -24490
5 ffff90b166a45600 -135138
6 ffff90b56cb2da00 -282505
7 ffff90b56cb2e000 -148065
8 ffff90b56cb2fa00 -872591
9 ffff90b56cb2c000 -84687
10 ffff90b56cb2f000 -87237
11 ffff90b166a40a00 -164582
Sometimes it is easier to see this by finding a process that is being
starved and looking at its sched_info: pcount and run_delay staying
unchanged across samples taken some time apart shows the task is never
being scheduled:
crash> task ffff8eb765994500 sched_info
PID: 7800 TASK: ffff8eb765994500 CPU: 16 COMMAND: "cputest"
sched_info = {
pcount = 8,
run_delay = 697094208,
last_arrival = 240260125039,
last_queued = 240260327513
},
crash> task ffff8eb765994500 sched_info
PID: 7800 TASK: ffff8eb765994500 CPU: 16 COMMAND: "cputest"
sched_info = {
pcount = 8,
run_delay = 697094208,
last_arrival = 240260125039,
last_queued = 240260327513
},
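For completeness, here is the reproducer sketch mentioned above. It is a
sketch under stated assumptions: a cgroup v1 cpu controller mounted at
/sys/fs/cgroup/cpu; the group name "starve", the 16 spinners, and the
1000us quota per 100ms period are example values. Run as root and adjust
for the system under test:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>

static void put(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f || fputs(val, f) == EOF) {
		perror(path);
		exit(1);
	}
	fclose(f);
}

int main(void)
{
	int i;

	mkdir("/sys/fs/cgroup/cpu/starve", 0755);
	put("/sys/fs/cgroup/cpu/starve/cpu.cfs_period_us", "100000");
	put("/sys/fs/cgroup/cpu/starve/cpu.cfs_quota_us", "1000");

	for (i = 0; i < 16; i++) {	/* a handful of cpu consumers */
		if (fork() == 0) {
			char pid[32];

			snprintf(pid, sizeof(pid), "%d", getpid());
			put("/sys/fs/cgroup/cpu/starve/tasks", pid);
			for (;;)
				;	/* spin */
		}
	}
	pause();	/* leave the children running; inspect with crash */
	return 0;
}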
fair.c | 22 +++++++++++++++++++---
sched.h | 2 ++
2 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7fc4a371bdd2..f88e00705b55 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/*
 	 * Add to the _head_ of the list, so that an already-started
-	 * distribute_cfs_runtime will not see us
+	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+	 * not running, add to the tail so that later runqueues don't get starved.
 	 */
-	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	if (cfs_b->distribute_running)
+		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	else
+		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 
 	/*
 	 * If we're the first throttled task, make sure the bandwidth
@@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	 * in us over-using our runtime if it is all used during this loop, but
 	 * only by limited amounts in that extreme case.
 	 */
-	while (throttled && cfs_b->runtime > 0) {
+	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
 		runtime = cfs_b->runtime;
+		cfs_b->distribute_running = 1;
 		raw_spin_unlock(&cfs_b->lock);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
 		runtime = distribute_cfs_runtime(cfs_b, runtime,
 						 runtime_expires);
 		raw_spin_lock(&cfs_b->lock);
 
+		cfs_b->distribute_running = 0;
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	/* confirm we're still not at a refresh boundary */
 	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->distribute_running) {
+		raw_spin_unlock(&cfs_b->lock);
+		return;
+	}
+
 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
 		raw_spin_unlock(&cfs_b->lock);
 		return;
@@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	runtime = cfs_b->runtime;
 	expires = cfs_b->runtime_expires;
+	if (runtime)
+		cfs_b->distribute_running = 1;
+
 	raw_spin_unlock(&cfs_b->lock);
 
 	if (!runtime)
@@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	raw_spin_lock(&cfs_b->lock);
 	if (expires == cfs_b->runtime_expires)
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
+	cfs_b->distribute_running = 0;
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
+	cfs_b->distribute_running = 0;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 455fa330de04..9683f458aec7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -346,6 +346,8 @@ struct cfs_bandwidth {
 	int			nr_periods;
 	int			nr_throttled;
 	u64			throttled_time;
+
+	bool			distribute_running;
 #endif
 };
 
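For readers following the locking, here is a compact userspace model of
the protocol the patch adds, with a pthread mutex standing in for
cfs_b->lock. The names mirror the patch, but everything else is a
simplification for illustration, not the kernel implementation:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool distribute_running;
static long runtime = 1000;

/* hand out runtime to throttled entries; called with the lock dropped */
static long distribute(long avail)
{
	return avail;	/* pretend everything was handed out */
}

/* models do_sched_cfs_period_timer() */
static void period_timer(void)
{
	pthread_mutex_lock(&lock);
	while (runtime > 0 && !distribute_running) {
		long r = runtime;

		distribute_running = true;
		pthread_mutex_unlock(&lock);	/* can't hold it while distributing */
		r = distribute(r);
		pthread_mutex_lock(&lock);
		distribute_running = false;
		runtime -= r < runtime ? r : runtime;
	}
	pthread_mutex_unlock(&lock);
}

/* models do_sched_cfs_slack_timer() */
static void slack_timer(void)
{
	pthread_mutex_lock(&lock);
	if (distribute_running) {
		/* a distribution is already in flight; don't race with it */
		pthread_mutex_unlock(&lock);
		return;
	}
	if (runtime) {
		long r = runtime;

		distribute_running = true;
		pthread_mutex_unlock(&lock);
		r = distribute(r);
		pthread_mutex_lock(&lock);
		runtime -= r < runtime ? r : runtime;
		distribute_running = false;
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	slack_timer();
	period_timer();
	return 0;
}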
--
The patch below does not apply to the 4.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bbcda30271752bb7490f2e2aef5411dbcae69116 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Mon, 8 Oct 2018 19:08:05 -0700
Subject: [PATCH] remoteproc: qcom: q6v5-mss: add SCM probe dependency
The memory ownership transfer request is performed using SCM, so ensure
that SCM is available before we probe the driver whenever the subsystem
needs memory protection. (A generic sketch of this probe-deferral pattern
follows the patch below.)
Fixes: 6c5a9dc2481b ("remoteproc: qcom: Make secure world call for mem ownership switch")
Cc: stable(a)vger.kernel.org
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
[bjorn: Added condition for need_mem_protection, updated commit message]
Signed-off-by: Bjorn Andersson <bjorn.andersson(a)linaro.org>
diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c
index 3448f1df2e87..4c47f5e0a87c 100644
--- a/drivers/remoteproc/qcom_q6v5_mss.c
+++ b/drivers/remoteproc/qcom_q6v5_mss.c
@@ -1152,6 +1152,9 @@ static int q6v5_probe(struct platform_device *pdev)
 	if (!desc)
 		return -EINVAL;
 
+	if (desc->need_mem_protection && !qcom_scm_is_available())
+		return -EPROBE_DEFER;
+
 	rproc = rproc_alloc(&pdev->dev, pdev->name, &q6v5_ops,
 			    desc->hexagon_mba_image, sizeof(*qproc));
 	if (!rproc) {
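As an aside, the defer-on-missing-dependency pattern used here looks
roughly like the sketch below. This is a generic illustration: only
qcom_scm_is_available() and -EPROBE_DEFER come from the patch; the probe
function name and the rest are placeholders.

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/qcom_scm.h>

static int example_probe(struct platform_device *pdev)
{
	/*
	 * If a required firmware service is not up yet, returning
	 * -EPROBE_DEFER asks the driver core to retry this probe later
	 * instead of failing the device outright.
	 */
	if (!qcom_scm_is_available())
		return -EPROBE_DEFER;

	/* normal resource setup would continue here */
	return 0;
}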
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bbcda30271752bb7490f2e2aef5411dbcae69116 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Mon, 8 Oct 2018 19:08:05 -0700
Subject: [PATCH] remoteproc: qcom: q6v5-mss: add SCM probe dependency
The memory ownership transfer request is performed using SCM, so ensure
that SCM is available before we probe the driver whenever the subsystem
needs memory protection.
Fixes: 6c5a9dc2481b ("remoteproc: qcom: Make secure world call for mem ownership switch")
Cc: stable(a)vger.kernel.org
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
[bjorn: Added condition for need_mem_protection, updated commit message]
Signed-off-by: Bjorn Andersson <bjorn.andersson(a)linaro.org>
diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c
index 3448f1df2e87..4c47f5e0a87c 100644
--- a/drivers/remoteproc/qcom_q6v5_mss.c
+++ b/drivers/remoteproc/qcom_q6v5_mss.c
@@ -1152,6 +1152,9 @@ static int q6v5_probe(struct platform_device *pdev)
 	if (!desc)
 		return -EINVAL;
 
+	if (desc->need_mem_protection && !qcom_scm_is_available())
+		return -EPROBE_DEFER;
+
 	rproc = rproc_alloc(&pdev->dev, pdev->name, &q6v5_ops,
 			    desc->hexagon_mba_image, sizeof(*qproc));
 	if (!rproc) {
The patch below does not apply to the 4.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 490d84f6d73c12f4204241cff8651eed60aae914 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil(a)xs4all.nl>
Date: Mon, 15 Oct 2018 06:14:22 -0400
Subject: [PATCH] media: cec: forgot to cancel delayed work
If the wait for completion was interrupted, make sure to cancel any
delayed work.
This can only happen when a transmit is waiting for a reply and the
waiting thread is interrupted, e.g. by Ctrl-C or a reboot/poweroff,
after which the interrupted code proceeds to delete the CEC message.
Since the delayed work was never canceled, it would eventually fire,
reference the now-stale data, and cause a kernel oops. (A generic sketch
of this cancel-before-teardown pattern follows the patch below.)
Fixes: 7ec2b3b941a6 ("cec: add new tx/rx status bits to detect aborts/timeouts")
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Cc: <stable(a)vger.kernel.org> # for v4.18 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung(a)kernel.org>
diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c
index 0c0d9107383e..31d1f4ab915e 100644
--- a/drivers/media/cec/cec-adap.c
+++ b/drivers/media/cec/cec-adap.c
@@ -844,6 +844,8 @@ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg,
 	 */
 	mutex_unlock(&adap->lock);
 	wait_for_completion_killable(&data->c);
+	if (!data->completed)
+		cancel_delayed_work_sync(&data->work);
 	mutex_lock(&adap->lock);
 
 	/* Cancel the transmit if it was interrupted */
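The underlying pattern, sketched generically below, is an illustration of
the rule the fix enforces, not the cec driver's actual structures; the
struct and function names here are made up:

#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/types.h>

struct xfer {
	struct completion c;
	struct delayed_work work;	/* may still reference this struct */
	bool completed;
};

static void xfer_wait(struct xfer *d)
{
	wait_for_completion_killable(&d->c);
	/*
	 * If the wait was interrupted before the work completed, the
	 * delayed work can still fire later and touch *d; cancel it
	 * synchronously before the caller tears d down.
	 */
	if (!d->completed)
		cancel_delayed_work_sync(&d->work);
}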
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 490d84f6d73c12f4204241cff8651eed60aae914 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil(a)xs4all.nl>
Date: Mon, 15 Oct 2018 06:14:22 -0400
Subject: [PATCH] media: cec: forgot to cancel delayed work
If the wait for completion was interrupted, make sure to cancel any
delayed work.
This can only happen when a transmit is waiting for a reply and the
waiting thread is interrupted, e.g. by Ctrl-C or a reboot/poweroff,
after which the interrupted code proceeds to delete the CEC message.
Since the delayed work was never canceled, it would eventually fire,
reference the now-stale data, and cause a kernel oops.
Fixes: 7ec2b3b941a6 ("cec: add new tx/rx status bits to detect aborts/timeouts")
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Cc: <stable(a)vger.kernel.org> # for v4.18 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung(a)kernel.org>
diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c
index 0c0d9107383e..31d1f4ab915e 100644
--- a/drivers/media/cec/cec-adap.c
+++ b/drivers/media/cec/cec-adap.c
@@ -844,6 +844,8 @@ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg,
 	 */
 	mutex_unlock(&adap->lock);
 	wait_for_completion_killable(&data->c);
+	if (!data->completed)
+		cancel_delayed_work_sync(&data->work);
 	mutex_lock(&adap->lock);
 
 	/* Cancel the transmit if it was interrupted */
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 250854eed5d45a73d81e4137dfd85180af6f2ec3 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil(a)xs4all.nl>
Date: Mon, 8 Oct 2018 15:08:27 -0400
Subject: [PATCH] media: v4l2-tpg: fix kernel oops when enabling HFLIP and OSD
When the OSD is on (i.e. vivid displays text on top of the test pattern)
and you enable hflip, the driver crashes.
The cause turned out to be a division of a negative number by an unsigned
value: the signed operand is converted to unsigned before the division,
so you expect -8 / 2U to be -4, but in reality it is 2147483644 :-(
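A two-line demonstration of the promotion at fault (plain hosted C,
nothing driver-specific); the fix below casts the divisor back to int:

#include <stdio.h>

int main(void)
{
	unsigned int hdiv = 2;

	printf("%u\n", -8 / hdiv);	/* 2147483644: -8 wraps to 4294967288 */
	printf("%d\n", -8 / (int)hdiv);	/* -4, as intended */
	return 0;
}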
Fixes: 3e14e7a82c1ef ("vivid-tpg: add hor/vert downsampling support to tpg_gen_text")
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Reported-by: Mauro Carvalho Chehab <mchehab+samsung(a)kernel.org>
Cc: <stable(a)vger.kernel.org> # for v4.1 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung(a)kernel.org>
diff --git a/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c b/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
index 76b125ebee6d..fa483b95bc5a 100644
--- a/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
+++ b/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
@@ -1801,7 +1801,7 @@ typedef struct { u16 __; u8 _; } __packed x24;
 			pos[7] = (chr & (0x01 << 0) ? fg : bg); \
 		} \
 \
-		pos += (tpg->hflip ? -8 : 8) / hdiv; \
+		pos += (tpg->hflip ? -8 : 8) / (int)hdiv; \
 	} \
 } \
 } while (0)
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ef739b2175dde9c05594f768cb78149f1ce2ac36 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever(a)oracle.com>
Date: Mon, 1 Oct 2018 14:25:14 -0400
Subject: [PATCH] xprtrdma: Reset credit grant properly after a disconnect
On a fresh connection, an RPC/RDMA client is supposed to send only
one RPC Call until it gets a credit grant in the first RPC Reply
from the server [RFC 8166, Section 3.3.3].
There is a bug in the Linux client's credit accounting mechanism
introduced by commit e7ce710a8802 ("xprtrdma: Avoid deadlock when
credit window is reset"). On connect, it simply dumps all pending
RPC Calls onto the new connection.
Servers have been tolerant of this bad behavior. Currently no server
implementation ever changes its credit grant over reconnects, and
servers always repost enough Receives before connections are fully
established.
To correct this issue, ensure that the client resets both the credit
grant _and_ the congestion window when handling a reconnect.
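In outline, the reconnect handling the patch adds amounts to the
following schematic model. This is not the sunrpc code: the field names
follow the patch below, but the struct, the function, and the constant's
value are placeholders.

/* schematic only: the real fields live in rpcrdma_buffer and rpc_xprt */
struct model_xprt {
	unsigned int rb_credits;	/* last credit grant from the server */
	unsigned long cwnd;		/* congestion window */
};

#define MODEL_CWNDSHIFT 256UL		/* placeholder for RPC_CWNDSHIFT */

static void model_close(struct model_xprt *x)
{
	/*
	 * Forget the old grant: the next connection starts with a
	 * single credit, as RFC 8166, Section 3.3.3 requires, so only
	 * one Call is sent until the first Reply arrives.
	 */
	x->rb_credits = 1;
	x->cwnd = MODEL_CWNDSHIFT;
}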
Fixes: e7ce710a8802 ("xprtrdma: Avoid deadlock when credit window is reset")
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
Cc: stable(a)kernel.org
Signed-off-by: Anna Schumaker <Anna.Schumaker(a)Netapp.com>
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index a68180090554..b9827665ff35 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -248,6 +248,7 @@ static void
 xprt_rdma_bc_close(struct rpc_xprt *xprt)
 {
 	dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+	xprt->cwnd = RPC_CWNDSHIFT;
 }
 
 static void
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 143ce2579ba9..98cbc7b060ba 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -468,6 +468,12 @@ xprt_rdma_close(struct rpc_xprt *xprt)
 	xprt->reestablish_timeout = 0;
 	xprt_disconnect_done(xprt);
 	rpcrdma_ep_disconnect(ep, ia);
+
+	/* Prepare @xprt for the next connection by reinitializing
+	 * its credit grant to one (see RFC 8166, Section 3.3.3).
+	 */
+	r_xprt->rx_buf.rb_credits = 1;
+	xprt->cwnd = RPC_CWNDSHIFT;
 }
 
 /**