As noted by Michal, the blkg_iostat_set's in the lockless list hold
reference to blkg's to protect against their removal. Those blkg's
hold reference to blkcg. When a cgroup is being destroyed,
cgroup_rstat_flush() is only called at css_release_work_fn() which
is called when the blkcg reference count reaches 0. This circular
dependency will prevent blkcg and some blkgs from being freed after
they are made offline.
It is less a problem if the cgroup to be destroyed also has other
controllers like memory that will call cgroup_rstat_flush() which will
clean up the reference count. If block is the only controller that uses
rstat, these offline blkcg and blkgs may never be freed leaking more
and more memory over time.
To prevent this potential memory leak:
- flush blkcg per-cpu stats list in __blkg_release(), when no new stat
can be added
- add global blkg_stat_lock for covering concurrent parent blkg stat
update
- don't grab bio->bi_blkg reference when adding the stats into blkcg's
per-cpu stat list since all stats are guaranteed to be consumed before
releasing blkg instance, and grabbing blkg reference for stats was the
most fragile part of original patch
Based on Waiman's patch:
https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.…
Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
Cc: stable(a)vger.kernel.org
Reported-by: Jay Shin <jaeshin(a)redhat.com>
Cc: Waiman Long <longman(a)redhat.com>
Cc: Tejun Heo <tj(a)kernel.org>
Cc: mkoutny(a)suse.com
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Signed-off-by: Ming Lei <ming.lei(a)redhat.com>
---
V3:
- add one global blkg_stat_lock for avoiding concurrent update on
blkg stat; this way is easier for backport, also won't cause contention;
V2:
- remove kernel/cgroup change, and call blkcg_rstat_flush()
to flush stat directly
block/blk-cgroup.c | 40 +++++++++++++++++++++++++++++++---------
1 file changed, 31 insertions(+), 9 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ce64dd73cfe..f0b5c9c41cde 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -34,6 +34,8 @@
#include "blk-ioprio.h"
#include "blk-throttle.h"
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
+
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
* blkcg_pol_register_mutex nests outside of it and synchronizes entire
@@ -56,6 +58,8 @@ static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
bool blkcg_debug_stats = false;
+static DEFINE_RAW_SPINLOCK(blkg_stat_lock);
+
#define BLKG_DESTROY_BATCH_SIZE 64
/*
@@ -163,10 +167,20 @@ static void blkg_free(struct blkcg_gq *blkg)
static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+ struct blkcg *blkcg = blkg->blkcg;
+ int cpu;
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
WARN_ON(!bio_list_empty(&blkg->async_bios));
#endif
+ /*
+ * Flush all the non-empty percpu lockless lists before releasing
+ * us, given these stat belongs to us.
+ *
+ * blkg_stat_lock is for serializing blkg stat update
+ */
+ for_each_possible_cpu(cpu)
+ __blkcg_rstat_flush(blkcg, cpu);
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
@@ -951,23 +965,26 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
{
- struct blkcg *blkcg = css_to_blkcg(css);
struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
struct llist_node *lnode;
struct blkg_iostat_set *bisc, *next_bisc;
- /* Root-level stats are sourced from system-wide IO stats */
- if (!cgroup_parent(css->cgroup))
- return;
-
rcu_read_lock();
lnode = llist_del_all(lhead);
if (!lnode)
goto out;
+ /*
+ * For covering concurrent parent blkg update from blkg_release().
+ *
+ * When flushing from cgroup, cgroup_rstat_lock is always held, so
+ * this lock won't cause contention most of time.
+ */
+ raw_spin_lock(&blkg_stat_lock);
+
/*
* Iterate only the iostat_cpu's queued in the lockless list.
*/
@@ -991,13 +1008,19 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent && parent->parent)
blkcg_iostat_update(parent, &blkg->iostat.cur,
&blkg->iostat.last);
- percpu_ref_put(&blkg->refcnt);
}
-
+ raw_spin_unlock(&blkg_stat_lock);
out:
rcu_read_unlock();
}
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ /* Root-level stats are sourced from system-wide IO stats */
+ if (cgroup_parent(css->cgroup))
+ __blkcg_rstat_flush(css_to_blkcg(css), cpu);
+}
+
/*
* We source root cgroup stats from the system-wide stats to avoid
* tracking the same information twice and incurring overhead when no
@@ -2075,7 +2098,6 @@ void blk_cgroup_bio_start(struct bio *bio)
llist_add(&bis->lnode, lhead);
WRITE_ONCE(bis->lqueued, true);
- percpu_ref_get(&bis->blkg->refcnt);
}
u64_stats_update_end_irqrestore(&bis->sync, flags);
--
2.40.1
I'm proposing to address the most obvious issues with dpt_i2o on stable
branches. At this stage it may be better to remove it as has been done
upstream, but I'd rather limit the regression for anyone still using
the hardware.
The changes are:
- "scsi: dpt_i2o: Remove broken pass-through ioctl (I2OUSERCMD)",
which closes security flaws including CVE-2023-2007.
- "scsi: dpt_i2o: Do not process completions with invalid addresses",
which removes the remaining bus_to_virt() call and may slightly
improve handling of misbehaving hardware.
These changes have been compiled on all the relevant stable branches,
but I don't have hardware to test on.
Ben.
--
Ben Hutchings - Debian developer, member of kernel, installer and LTS
teams
I did some randconfig testing on 5.10.y and the following patches are
required.
d7a7d721064c5 ("media: ti-vpe: cal: avoid FIELD_GET assertion")
42d95d1b3a9c6 ("drm/rcar: stop using 'imply' for dependencies")
The first patch is only required on 5.10.y.
The second "drm/rcar" commit is required in 5.15.y as well.
I'm going to be doing regular randconfig testing on stable so let me
know if you have any advice.
regards,
dan carpenter
[Why]
The sequence for collecting down_reply from source perspective should
be:
Request_n->repeat (get partial reply of Request_n->clear message ready
flag to ack DPRX that the message is received) till all partial
replies for Request_n are received->new Request_n+1.
Now there is chance that drm_dp_mst_hpd_irq() will fire new down
request in the tx queue when the down reply is incomplete. Source is
restricted to generate interveleaved message transactions so we should
avoid it.
Also, while assembling partial reply packets, reading out DPCD DOWN_REP
Sideband MSG buffer + clearing DOWN_REP_MSG_RDY flag should be
wrapped up as a complete operation for reading out a reply packet.
Kicking off a new request before clearing DOWN_REP_MSG_RDY flag might
be risky. e.g. If the reply of the new request has overwritten the
DPRX DOWN_REP Sideband MSG buffer before source writing one to clear
DOWN_REP_MSG_RDY flag, source then unintentionally flushes the reply
for the new request. Should handle the up request in the same way.
[How]
Separete drm_dp_mst_hpd_irq() into 2 steps. After acking the MST IRQ
event, driver calls drm_dp_mst_hpd_irq_send_new_request() and might
trigger drm_dp_mst_kick_tx() only when there is no on going message
transaction.
Changes since v1:
* Reworked on review comments received
-> Adjust the fix to let driver explicitly kick off new down request
when mst irq event is handled and acked
-> Adjust the commit message
Changes since v2:
* Adjust the commit message
* Adjust the naming of the divided 2 functions and add a new input
parameter "ack".
* Adjust code flow as per review comments.
Signed-off-by: Wayne Lin <Wayne.Lin(a)amd.com>
Cc: stable(a)vger.kernel.org
---
.../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 33 +++++++++-------
drivers/gpu/drm/display/drm_dp_mst_topology.c | 39 +++++++++++++++++--
drivers/gpu/drm/i915/display/intel_dp.c | 7 ++--
drivers/gpu/drm/nouveau/dispnv50/disp.c | 12 ++++--
include/drm/display/drm_dp_mst_helper.h | 7 +++-
5 files changed, 70 insertions(+), 28 deletions(-)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index d5cec03eaa8d..597c3368bcfb 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -3236,6 +3236,7 @@ static void dm_handle_mst_sideband_msg(struct amdgpu_dm_connector *aconnector)
{
u8 esi[DP_PSR_ERROR_STATUS - DP_SINK_COUNT_ESI] = { 0 };
u8 dret;
+ u8 ack;
bool new_irq_handled = false;
int dpcd_addr;
int dpcd_bytes_to_read;
@@ -3265,34 +3266,36 @@ static void dm_handle_mst_sideband_msg(struct amdgpu_dm_connector *aconnector)
process_count < max_process_count) {
u8 retry;
dret = 0;
+ ack = 0;
process_count++;
DRM_DEBUG_DRIVER("ESI %02x %02x %02x\n", esi[0], esi[1], esi[2]);
/* handle HPD short pulse irq */
if (aconnector->mst_mgr.mst_state)
- drm_dp_mst_hpd_irq(
- &aconnector->mst_mgr,
- esi,
- &new_irq_handled);
+ drm_dp_mst_hpd_irq_handle_event(&aconnector->mst_mgr,
+ esi,
+ &ack,
+ &new_irq_handled);
if (new_irq_handled) {
/* ACK at DPCD to notify down stream */
- const int ack_dpcd_bytes_to_write =
- dpcd_bytes_to_read - 1;
-
for (retry = 0; retry < 3; retry++) {
- u8 wret;
-
- wret = drm_dp_dpcd_write(
- &aconnector->dm_dp_aux.aux,
- dpcd_addr + 1,
- &esi[1],
- ack_dpcd_bytes_to_write);
- if (wret == ack_dpcd_bytes_to_write)
+ ssize_t wret;
+
+ wret = drm_dp_dpcd_writeb(&aconnector->dm_dp_aux.aux,
+ dpcd_addr + 1,
+ ack);
+ if (wret == 1)
break;
}
+ if (retry == 3) {
+ DRM_ERROR("Failed to ack MST event.\n");
+ return;
+ }
+
+ drm_dp_mst_hpd_irq_send_new_request(&aconnector->mst_mgr);
/* check if there is new irq to be handled */
dret = drm_dp_dpcd_read(
&aconnector->dm_dp_aux.aux,
diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c
index 38dab76ae69e..13165e764709 100644
--- a/drivers/gpu/drm/display/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c
@@ -4053,9 +4053,10 @@ static int drm_dp_mst_handle_up_req(struct drm_dp_mst_topology_mgr *mgr)
}
/**
- * drm_dp_mst_hpd_irq() - MST hotplug IRQ notify
+ * drm_dp_mst_hpd_irq_handle_event() - MST hotplug IRQ handle MST event
* @mgr: manager to notify irq for.
* @esi: 4 bytes from SINK_COUNT_ESI
+ * @ack: flags of events to ack
* @handled: whether the hpd interrupt was consumed or not
*
* This should be called from the driver when it detects a short IRQ,
@@ -4063,7 +4064,8 @@ static int drm_dp_mst_handle_up_req(struct drm_dp_mst_topology_mgr *mgr)
* topology manager will process the sideband messages received as a result
* of this.
*/
-int drm_dp_mst_hpd_irq(struct drm_dp_mst_topology_mgr *mgr, u8 *esi, bool *handled)
+int drm_dp_mst_hpd_irq_handle_event(struct drm_dp_mst_topology_mgr *mgr, const u8 *esi,
+ u8 *ack, bool *handled)
{
int ret = 0;
int sc;
@@ -4078,18 +4080,47 @@ int drm_dp_mst_hpd_irq(struct drm_dp_mst_topology_mgr *mgr, u8 *esi, bool *handl
if (esi[1] & DP_DOWN_REP_MSG_RDY) {
ret = drm_dp_mst_handle_down_rep(mgr);
*handled = true;
+ *ack |= DP_DOWN_REP_MSG_RDY;
}
if (esi[1] & DP_UP_REQ_MSG_RDY) {
ret |= drm_dp_mst_handle_up_req(mgr);
*handled = true;
+ *ack |= DP_UP_REQ_MSG_RDY;
}
- drm_dp_mst_kick_tx(mgr);
return ret;
}
-EXPORT_SYMBOL(drm_dp_mst_hpd_irq);
+EXPORT_SYMBOL(drm_dp_mst_hpd_irq_handle_event);
+
+/**
+ * drm_dp_mst_hpd_irq_send_new_request() - MST hotplug IRQ kick off new request
+ * @mgr: manager to notify irq for.
+ *
+ * This should be called from the driver when mst irq event is handled
+ * and acked. Note that new down request should only be sent when
+ * previous message transaction is completed. Source is not supposed to generate
+ * interleaved message transactions.
+ */
+void drm_dp_mst_hpd_irq_send_new_request(struct drm_dp_mst_topology_mgr *mgr)
+{
+ struct drm_dp_sideband_msg_tx *txmsg;
+ bool kick = true;
+ mutex_lock(&mgr->qlock);
+ txmsg = list_first_entry_or_null(&mgr->tx_msg_downq,
+ struct drm_dp_sideband_msg_tx, next);
+ /* If last transaction is not completed yet*/
+ if (!txmsg ||
+ txmsg->state == DRM_DP_SIDEBAND_TX_START_SEND ||
+ txmsg->state == DRM_DP_SIDEBAND_TX_SENT)
+ kick = false;
+ mutex_unlock(&mgr->qlock);
+
+ if (kick)
+ drm_dp_mst_kick_tx(mgr);
+}
+EXPORT_SYMBOL(drm_dp_mst_hpd_irq_send_new_request);
/**
* drm_dp_mst_detect_port() - get connection status for an MST port
* @connector: DRM connector for this port
diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c
index 4bec8cd7979f..f24602887015 100644
--- a/drivers/gpu/drm/i915/display/intel_dp.c
+++ b/drivers/gpu/drm/i915/display/intel_dp.c
@@ -4062,9 +4062,7 @@ intel_dp_mst_hpd_irq(struct intel_dp *intel_dp, u8 *esi, u8 *ack)
{
bool handled = false;
- drm_dp_mst_hpd_irq(&intel_dp->mst_mgr, esi, &handled);
- if (handled)
- ack[1] |= esi[1] & (DP_DOWN_REP_MSG_RDY | DP_UP_REQ_MSG_RDY);
+ drm_dp_mst_hpd_irq_handle_event(&intel_dp->mst_mgr, esi, &ack[1], &handled);
if (esi[1] & DP_CP_IRQ) {
intel_hdcp_handle_cp_irq(intel_dp->attached_connector);
@@ -4139,6 +4137,9 @@ intel_dp_check_mst_status(struct intel_dp *intel_dp)
if (!intel_dp_ack_sink_irq_esi(intel_dp, ack))
drm_dbg_kms(&i915->drm, "Failed to ack ESI\n");
+
+ if (ack[1] & (DP_DOWN_REP_MSG_RDY | DP_UP_REQ_MSG_RDY))
+ drm_dp_mst_hpd_irq_send_new_request(&intel_dp->mst_mgr);
}
return link_ok;
diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c
index 9b6824f6b9e4..b2d9978e88a8 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/disp.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c
@@ -1357,6 +1357,7 @@ nv50_mstm_service(struct nouveau_drm *drm,
bool handled = true, ret = true;
int rc;
u8 esi[8] = {};
+ u8 ack;
while (handled) {
rc = drm_dp_dpcd_read(aux, DP_SINK_COUNT_ESI, esi, 8);
@@ -1365,16 +1366,19 @@ nv50_mstm_service(struct nouveau_drm *drm,
break;
}
- drm_dp_mst_hpd_irq(&mstm->mgr, esi, &handled);
+ ack = 0;
+ drm_dp_mst_hpd_irq_handle_event(&mstm->mgr, esi, &ack, &handled);
if (!handled)
break;
- rc = drm_dp_dpcd_write(aux, DP_SINK_COUNT_ESI + 1, &esi[1],
- 3);
- if (rc != 3) {
+ rc = drm_dp_dpcd_writeb(aux, DP_SINK_COUNT_ESI + 1, ack);
+
+ if (rc != 1) {
ret = false;
break;
}
+
+ drm_dp_mst_hpd_irq_send_new_request(&mstm->mgr);
}
if (!ret)
diff --git a/include/drm/display/drm_dp_mst_helper.h b/include/drm/display/drm_dp_mst_helper.h
index 32c764fb9cb5..40e855c8407c 100644
--- a/include/drm/display/drm_dp_mst_helper.h
+++ b/include/drm/display/drm_dp_mst_helper.h
@@ -815,8 +815,11 @@ void drm_dp_mst_topology_mgr_destroy(struct drm_dp_mst_topology_mgr *mgr);
bool drm_dp_read_mst_cap(struct drm_dp_aux *aux, const u8 dpcd[DP_RECEIVER_CAP_SIZE]);
int drm_dp_mst_topology_mgr_set_mst(struct drm_dp_mst_topology_mgr *mgr, bool mst_state);
-int drm_dp_mst_hpd_irq(struct drm_dp_mst_topology_mgr *mgr, u8 *esi, bool *handled);
-
+int drm_dp_mst_hpd_irq_handle_event(struct drm_dp_mst_topology_mgr *mgr,
+ const u8 *esi,
+ u8 *ack,
+ bool *handled);
+void drm_dp_mst_hpd_irq_send_new_request(struct drm_dp_mst_topology_mgr *mgr);
int
drm_dp_mst_detect_port(struct drm_connector *connector,
--
2.37.3