From: Michael J. Ruhl <michael.j.ruhl(a)intel.com>
commit a0e0cb82804a6a21d9067022c2dfdf80d11da429 upstream
pq_update() can only be called in two places: from the completion
function when the complete (npkts) sequence of packets has been
submitted and processed, or from setup function if a subset of the
packets were submitted (i.e. the error path).
Currently both paths can call pq_update() if an error occurrs. This
race will cause the n_req value to go negative, hanging file_close(),
or cause a crash by freeing the txlist more than once.
Several variables are used to determine SDMA send state. Most of
these are unnecessary, and have code inspectible races between the
setup function and the completion function, in both the send path and
the error path.
The request 'status' value can be set by the setup or by the
completion function. This is code inspectibly racy. Since the status
is not needed in the completion code or by the caller it has been
removed.
The request 'done' value races between usage by the setup and the
completion function. The completion function does not need this.
When the number of processed packets matches npkts, it is done.
The 'has_error' value races between usage of the setup and the
completion function. This can cause incorrect error handling and leave
the n_req in an incorrect value (i.e. negative).
Simplify the code by removing all of the unneeded state checks and
variables.
Clean up iovs node when it is freed.
Eliminate race conditions in the error path:
If all packets are submitted, the completion handler will set the
completion status correctly (ok or aborted).
If all packets are not submitted, the caller must wait until the
submitted packets have completed, and then set the completion status.
These two change eliminate the race condition in the error path.
Cc: <stable(a)vger.kernel.org> # 4.9.0
Reviewed-by: Mitko Haralanov <mitko.haralanov(a)intel.com>
Reviewed-by: Mike Marciniszyn <mike.marciniszyn(a)intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl(a)intel.com>
---
drivers/infiniband/hw/hfi1/user_sdma.c | 106 ++++++++++++++------------------
1 file changed, 45 insertions(+), 61 deletions(-)
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index 018a415..619475c 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -148,11 +148,8 @@
#define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)
/* SDMA request flag bits */
-#define SDMA_REQ_FOR_THREAD 1
-#define SDMA_REQ_SEND_DONE 2
-#define SDMA_REQ_HAVE_AHG 3
-#define SDMA_REQ_HAS_ERROR 4
-#define SDMA_REQ_DONE_ERROR 5
+#define SDMA_REQ_HAVE_AHG 1
+#define SDMA_REQ_HAS_ERROR 2
#define SDMA_PKT_Q_INACTIVE BIT(0)
#define SDMA_PKT_Q_ACTIVE BIT(1)
@@ -252,8 +249,6 @@ struct user_sdma_request {
u64 seqsubmitted;
struct list_head txps;
unsigned long flags;
- /* status of the last txreq completed */
- int status;
};
/*
@@ -546,7 +541,6 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
struct sdma_req_info info;
struct user_sdma_request *req;
u8 opcode, sc, vl;
- int req_queued = 0;
u16 dlid;
u32 selector;
@@ -611,11 +605,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
req->pq = pq;
req->cq = cq;
- req->status = -1;
INIT_LIST_HEAD(&req->txps);
memcpy(&req->info, &info, sizeof(info));
+ /* The request is initialized, count it */
+ atomic_inc(&pq->n_reqs);
+
if (req_opcode(info.ctrl) == EXPECTED) {
/* expected must have a TID info and at least one data vector */
if (req->data_iovs < 2) {
@@ -704,7 +700,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
ret = pin_vector_pages(req, &req->iovs[i]);
if (ret) {
- req->status = ret;
+ req->data_iovs = i;
goto free_req;
}
req->data_len += req->iovs[i].iov.iov_len;
@@ -772,14 +768,10 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
}
set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
- atomic_inc(&pq->n_reqs);
- req_queued = 1;
/* Send the first N packets in the request to buy us some time */
ret = user_sdma_send_pkts(req, pcount);
- if (unlikely(ret < 0 && ret != -EBUSY)) {
- req->status = ret;
+ if (unlikely(ret < 0 && ret != -EBUSY))
goto free_req;
- }
/*
* It is possible that the SDMA engine would have processed all the
@@ -796,17 +788,11 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
* request have been submitted to the SDMA engine. However, it
* will not wait for send completions.
*/
- while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
+ while (req->seqsubmitted != req->info.npkts) {
ret = user_sdma_send_pkts(req, pcount);
if (ret < 0) {
- if (ret != -EBUSY) {
- req->status = ret;
- set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
- if (ACCESS_ONCE(req->seqcomp) ==
- req->seqsubmitted - 1)
- goto free_req;
- return ret;
- }
+ if (ret != -EBUSY)
+ goto free_req;
wait_event_interruptible_timeout(
pq->busy.wait_dma,
(pq->state == SDMA_PKT_Q_ACTIVE),
@@ -817,10 +803,19 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
*count += idx;
return 0;
free_req:
- user_sdma_free_request(req, true);
- if (req_queued)
+ /*
+ * If the submitted seqsubmitted == npkts, the completion routine
+ * controls the final state. If sequbmitted < npkts, wait for any
+ * outstanding packets to finish before cleaning up.
+ */
+ if (req->seqsubmitted < req->info.npkts) {
+ if (req->seqsubmitted)
+ wait_event(pq->busy.wait_dma,
+ (req->seqcomp == req->seqsubmitted - 1));
+ user_sdma_free_request(req, true);
pq_update(pq);
- set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
+ set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
+ }
return ret;
}
@@ -903,10 +898,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
pq = req->pq;
/* If tx completion has reported an error, we are done. */
- if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
- set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+ if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags))
return -EFAULT;
- }
/*
* Check if we might have sent the entire request already
@@ -929,10 +922,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
* with errors. If so, we are not going to process any
* more packets from this request.
*/
- if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
- set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+ if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags))
return -EFAULT;
- }
tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
if (!tx)
@@ -1090,7 +1081,6 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
req->seqsubmitted += count;
if (req->seqsubmitted == req->info.npkts) {
- set_bit(SDMA_REQ_SEND_DONE, &req->flags);
/*
* The txreq has already been submitted to the HW queue
* so we can free the AHG entry now. Corruption will not
@@ -1489,11 +1479,15 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
return diff;
}
-/*
- * SDMA tx request completion callback. Called when the SDMA progress
- * state machine gets notification that the SDMA descriptors for this
- * tx request have been processed by the DMA engine. Called in
- * interrupt context.
+/**
+ * user_sdma_txreq_cb() - SDMA tx request completion callback.
+ * @txreq: valid sdma tx request
+ * @status: success/failure of request
+ *
+ * Called when the SDMA progress state machine gets notification that
+ * the SDMA descriptors for this tx request have been processed by the
+ * DMA engine. Called in interrupt context.
+ * Only do work on completed sequences.
*/
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
@@ -1502,7 +1496,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
struct user_sdma_request *req;
struct hfi1_user_sdma_pkt_q *pq;
struct hfi1_user_sdma_comp_q *cq;
- u16 idx;
+ enum hfi1_sdma_comp_state state = COMPLETE;
if (!tx->req)
return;
@@ -1515,31 +1509,19 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
SDMA_DBG(req, "SDMA completion with error %d",
status);
set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
+ state = ERROR;
}
req->seqcomp = tx->seqnum;
kmem_cache_free(pq->txreq_cache, tx);
- tx = NULL;
-
- idx = req->info.comp_idx;
- if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
- if (req->seqcomp == req->info.npkts - 1) {
- req->status = 0;
- user_sdma_free_request(req, false);
- pq_update(pq);
- set_comp_state(pq, cq, idx, COMPLETE, 0);
- }
- } else {
- if (status != SDMA_TXREQ_S_OK)
- req->status = status;
- if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
- (test_bit(SDMA_REQ_SEND_DONE, &req->flags) ||
- test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) {
- user_sdma_free_request(req, false);
- pq_update(pq);
- set_comp_state(pq, cq, idx, ERROR, req->status);
- }
- }
+
+ /* sequence isn't complete? We are done */
+ if (req->seqcomp != req->info.npkts - 1)
+ return;
+
+ user_sdma_free_request(req, false);
+ set_comp_state(pq, cq, req->info.comp_idx, state, status);
+ pq_update(pq);
}
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
@@ -1572,6 +1554,8 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
if (!node)
continue;
+ req->iovs[i].node = NULL;
+
if (unpin)
hfi1_mmu_rb_remove(req->pq->handler,
&node->rb);
From: Michael J. Ruhl <michael.j.ruhl(a)intel.com>
commit a0e0cb82804a6a21d9067022c2dfdf80d11da429 upstream
pq_update() can only be called in two places: from the completion
function when the complete (npkts) sequence of packets has been
submitted and processed, or from setup function if a subset of the
packets were submitted (i.e. the error path).
Currently both paths can call pq_update() if an error occurrs. This
race will cause the n_req value to go negative, hanging file_close(),
or cause a crash by freeing the txlist more than once.
Several variables are used to determine SDMA send state. Most of
these are unnecessary, and have code inspectible races between the
setup function and the completion function, in both the send path and
the error path.
The request 'status' value can be set by the setup or by the
completion function. This is code inspectibly racy. Since the status
is not needed in the completion code or by the caller it has been
removed.
The request 'done' value races between usage by the setup and the
completion function. The completion function does not need this.
When the number of processed packets matches npkts, it is done.
The 'has_error' value races between usage of the setup and the
completion function. This can cause incorrect error handling and leave
the n_req in an incorrect value (i.e. negative).
Simplify the code by removing all of the unneeded state checks and
variables.
Clean up iovs node when it is freed.
Eliminate race conditions in the error path:
If all packets are submitted, the completion handler will set the
completion status correctly (ok or aborted).
If all packets are not submitted, the caller must wait until the
submitted packets have completed, and then set the completion status.
These two change eliminate the race condition in the error path.
Cc: <stable(a)vger.kernel.org> # 4.18.x+
Reviewed-by: Mitko Haralanov <mitko.haralanov(a)intel.com>
Reviewed-by: Mike Marciniszyn <mike.marciniszyn(a)intel.com>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl(a)intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro(a)intel.com>
Signed-off-by: Jason Gunthorpe <jgg(a)mellanox.com>
---
drivers/infiniband/hw/hfi1/user_sdma.c | 87 ++++++++++++++------------------
drivers/infiniband/hw/hfi1/user_sdma.h | 3 -
2 files changed, 39 insertions(+), 51 deletions(-)
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index 5c88706..39134dd 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -328,7 +328,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
u8 opcode, sc, vl;
u16 pkey;
u32 slid;
- int req_queued = 0;
u16 dlid;
u32 selector;
@@ -392,7 +391,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
req->data_len = 0;
req->pq = pq;
req->cq = cq;
- req->status = -1;
req->ahg_idx = -1;
req->iov_idx = 0;
req->sent = 0;
@@ -400,12 +398,14 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
req->seqcomp = 0;
req->seqsubmitted = 0;
req->tids = NULL;
- req->done = 0;
req->has_error = 0;
INIT_LIST_HEAD(&req->txps);
memcpy(&req->info, &info, sizeof(info));
+ /* The request is initialized, count it */
+ atomic_inc(&pq->n_reqs);
+
if (req_opcode(info.ctrl) == EXPECTED) {
/* expected must have a TID info and at least one data vector */
if (req->data_iovs < 2) {
@@ -500,7 +500,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
ret = pin_vector_pages(req, &req->iovs[i]);
if (ret) {
req->data_iovs = i;
- req->status = ret;
goto free_req;
}
req->data_len += req->iovs[i].iov.iov_len;
@@ -561,14 +560,10 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
req->ahg_idx = sdma_ahg_alloc(req->sde);
set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
- atomic_inc(&pq->n_reqs);
- req_queued = 1;
/* Send the first N packets in the request to buy us some time */
ret = user_sdma_send_pkts(req, pcount);
- if (unlikely(ret < 0 && ret != -EBUSY)) {
- req->status = ret;
+ if (unlikely(ret < 0 && ret != -EBUSY))
goto free_req;
- }
/*
* It is possible that the SDMA engine would have processed all the
@@ -588,14 +583,8 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
while (req->seqsubmitted != req->info.npkts) {
ret = user_sdma_send_pkts(req, pcount);
if (ret < 0) {
- if (ret != -EBUSY) {
- req->status = ret;
- WRITE_ONCE(req->has_error, 1);
- if (READ_ONCE(req->seqcomp) ==
- req->seqsubmitted - 1)
- goto free_req;
- return ret;
- }
+ if (ret != -EBUSY)
+ goto free_req;
wait_event_interruptible_timeout(
pq->busy.wait_dma,
(pq->state == SDMA_PKT_Q_ACTIVE),
@@ -606,10 +595,19 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
*count += idx;
return 0;
free_req:
- user_sdma_free_request(req, true);
- if (req_queued)
+ /*
+ * If the submitted seqsubmitted == npkts, the completion routine
+ * controls the final state. If sequbmitted < npkts, wait for any
+ * outstanding packets to finish before cleaning up.
+ */
+ if (req->seqsubmitted < req->info.npkts) {
+ if (req->seqsubmitted)
+ wait_event(pq->busy.wait_dma,
+ (req->seqcomp == req->seqsubmitted - 1));
+ user_sdma_free_request(req, true);
pq_update(pq);
- set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
+ set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
+ }
return ret;
}
@@ -917,7 +915,6 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
req->seqsubmitted += count;
if (req->seqsubmitted == req->info.npkts) {
- WRITE_ONCE(req->done, 1);
/*
* The txreq has already been submitted to the HW queue
* so we can free the AHG entry now. Corruption will not
@@ -1365,11 +1362,15 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
return idx;
}
-/*
- * SDMA tx request completion callback. Called when the SDMA progress
- * state machine gets notification that the SDMA descriptors for this
- * tx request have been processed by the DMA engine. Called in
- * interrupt context.
+/**
+ * user_sdma_txreq_cb() - SDMA tx request completion callback.
+ * @txreq: valid sdma tx request
+ * @status: success/failure of request
+ *
+ * Called when the SDMA progress state machine gets notification that
+ * the SDMA descriptors for this tx request have been processed by the
+ * DMA engine. Called in interrupt context.
+ * Only do work on completed sequences.
*/
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
@@ -1378,7 +1379,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
struct user_sdma_request *req;
struct hfi1_user_sdma_pkt_q *pq;
struct hfi1_user_sdma_comp_q *cq;
- u16 idx;
+ enum hfi1_sdma_comp_state state = COMPLETE;
if (!tx->req)
return;
@@ -1391,31 +1392,19 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
SDMA_DBG(req, "SDMA completion with error %d",
status);
WRITE_ONCE(req->has_error, 1);
+ state = ERROR;
}
req->seqcomp = tx->seqnum;
kmem_cache_free(pq->txreq_cache, tx);
- tx = NULL;
-
- idx = req->info.comp_idx;
- if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
- if (req->seqcomp == req->info.npkts - 1) {
- req->status = 0;
- user_sdma_free_request(req, false);
- pq_update(pq);
- set_comp_state(pq, cq, idx, COMPLETE, 0);
- }
- } else {
- if (status != SDMA_TXREQ_S_OK)
- req->status = status;
- if (req->seqcomp == (READ_ONCE(req->seqsubmitted) - 1) &&
- (READ_ONCE(req->done) ||
- READ_ONCE(req->has_error))) {
- user_sdma_free_request(req, false);
- pq_update(pq);
- set_comp_state(pq, cq, idx, ERROR, req->status);
- }
- }
+
+ /* sequence isn't complete? We are done */
+ if (req->seqcomp != req->info.npkts - 1)
+ return;
+
+ user_sdma_free_request(req, false);
+ set_comp_state(pq, cq, req->info.comp_idx, state, status);
+ pq_update(pq);
}
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
@@ -1448,6 +1437,8 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
if (!node)
continue;
+ req->iovs[i].node = NULL;
+
if (unpin)
hfi1_mmu_rb_remove(req->pq->handler,
&node->rb);
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
index d2bc77f..0ae0645 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.h
+++ b/drivers/infiniband/hw/hfi1/user_sdma.h
@@ -205,8 +205,6 @@ struct user_sdma_request {
/* Writeable fields shared with interrupt */
u64 seqcomp ____cacheline_aligned_in_smp;
u64 seqsubmitted;
- /* status of the last txreq completed */
- int status;
/* Send side fields */
struct list_head txps ____cacheline_aligned_in_smp;
@@ -228,7 +226,6 @@ struct user_sdma_request {
u16 tididx;
/* progress index moving along the iovs array */
u8 iov_idx;
- u8 done;
u8 has_error;
struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
In order for ULPI PHYs to work, dwc3_phy_setup() and dwc3_ulpi_init()
must be doene before dwc3_core_get_phy().
commit 541768b08a40 ("usb: dwc3: core: Call dwc3_core_get_phy() before initializing phys")
broke this.
The other issue is that dwc3_core_get_phy() and dwc3_ulpi_init() should
be called only once during the life cycle of the driver. However,
as dwc3_core_init() is called during system suspend/resume it will
result in multiple calls to dwc3_core_get_phy() and dwc3_ulpi_init()
which is wrong.
Fix this by moving dwc3_ulpi_init() out of dwc3_phy_setup()
into dwc3_core_ulpi_init(). Use a flag 'ulpi_ready' to ensure that
dwc3_core_ulpi_init() is called only once from dwc3_core_init().
Use another flag 'phys_ready' to call dwc3_core_get_phy() only once from
dwc3_core_init().
Fixes: 541768b08a40 ("usb: dwc3: core: Call dwc3_core_get_phy() before initializing phys")
Fixes: f54edb539c11 ("usb: dwc3: core: initialize ULPI before trying to get the PHY")
Cc: linux-stable <stable(a)vger.kernel.org> # >= v4.13
Signed-off-by: Roger Quadros <rogerq(a)ti.com>
---
Changelog:
v2:
- don't break ULPI case. Also take into consideration suspend/resume for ULPI case.
drivers/usb/dwc3/core.c | 47 ++++++++++++++++++++++++++++++++++++-----------
drivers/usb/dwc3/core.h | 5 +++++
2 files changed, 41 insertions(+), 11 deletions(-)
diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index 0783250..84382f6 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -482,6 +482,22 @@ static void dwc3_cache_hwparams(struct dwc3 *dwc)
parms->hwparams8 = dwc3_readl(dwc->regs, DWC3_GHWPARAMS8);
}
+static int dwc3_core_ulpi_init(struct dwc3 *dwc)
+{
+ int intf;
+ int ret = 0;
+
+ intf = DWC3_GHWPARAMS3_HSPHY_IFC(dwc->hwparams.hwparams3);
+
+ if (intf == DWC3_GHWPARAMS3_HSPHY_IFC_ULPI ||
+ (intf == DWC3_GHWPARAMS3_HSPHY_IFC_UTMI_ULPI &&
+ dwc->hsphy_interface &&
+ !strncmp(dwc->hsphy_interface, "ulpi", 4)))
+ ret = dwc3_ulpi_init(dwc);
+
+ return ret;
+}
+
/**
* dwc3_phy_setup - Configure USB PHY Interface of DWC3 Core
* @dwc: Pointer to our controller context structure
@@ -493,7 +509,6 @@ static void dwc3_cache_hwparams(struct dwc3 *dwc)
static int dwc3_phy_setup(struct dwc3 *dwc)
{
u32 reg;
- int ret;
reg = dwc3_readl(dwc->regs, DWC3_GUSB3PIPECTL(0));
@@ -564,9 +579,6 @@ static int dwc3_phy_setup(struct dwc3 *dwc)
}
/* FALLTHROUGH */
case DWC3_GHWPARAMS3_HSPHY_IFC_ULPI:
- ret = dwc3_ulpi_init(dwc);
- if (ret)
- return ret;
/* FALLTHROUGH */
default:
break;
@@ -723,6 +735,7 @@ static void dwc3_core_setup_global_control(struct dwc3 *dwc)
}
static int dwc3_core_get_phy(struct dwc3 *dwc);
+static int dwc3_core_ulpi_init(struct dwc3 *dwc);
/**
* dwc3_core_init - Low-level initialization of DWC3 Core
@@ -754,17 +767,27 @@ static int dwc3_core_init(struct dwc3 *dwc)
dwc->maximum_speed = USB_SPEED_HIGH;
}
- ret = dwc3_core_get_phy(dwc);
+ ret = dwc3_phy_setup(dwc);
if (ret)
goto err0;
- ret = dwc3_core_soft_reset(dwc);
- if (ret)
- goto err0;
+ if (!dwc->ulpi_ready) {
+ ret = dwc3_core_ulpi_init(dwc);
+ if (ret)
+ goto err0;
+ dwc->ulpi_ready = true;
+ }
- ret = dwc3_phy_setup(dwc);
+ if (!dwc->phys_ready) {
+ ret = dwc3_core_get_phy(dwc);
+ if (ret)
+ goto err0a;
+ dwc->phys_ready = true;
+ }
+
+ ret = dwc3_core_soft_reset(dwc);
if (ret)
- goto err0;
+ goto err0a;
dwc3_core_setup_global_control(dwc);
dwc3_core_num_eps(dwc);
@@ -837,6 +860,9 @@ static int dwc3_core_init(struct dwc3 *dwc)
phy_exit(dwc->usb2_generic_phy);
phy_exit(dwc->usb3_generic_phy);
+err0a:
+ dwc3_ulpi_exit(dwc);
+
err0:
return ret;
}
@@ -1229,7 +1255,6 @@ static int dwc3_probe(struct platform_device *pdev)
err3:
dwc3_free_event_buffers(dwc);
- dwc3_ulpi_exit(dwc);
err2:
pm_runtime_allow(&pdev->dev);
diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h
index 4a4a4c9..6b202e3 100644
--- a/drivers/usb/dwc3/core.h
+++ b/drivers/usb/dwc3/core.h
@@ -795,7 +795,9 @@ struct dwc3_scratchpad_array {
* @usb3_phy: pointer to USB3 PHY
* @usb2_generic_phy: pointer to USB2 PHY
* @usb3_generic_phy: pointer to USB3 PHY
+ * @phys_ready: flag to indicate that PHYs are ready
* @ulpi: pointer to ulpi interface
+ * @ulpi_ready: flag to indicate that ULPI is initialized
* @isoch_delay: wValue from Set Isochronous Delay request;
* @u2sel: parameter from Set SEL request.
* @u2pel: parameter from Set SEL request.
@@ -893,7 +895,10 @@ struct dwc3 {
struct phy *usb2_generic_phy;
struct phy *usb3_generic_phy;
+ bool phys_ready;
+
struct ulpi *ulpi;
+ bool ulpi_ready;
void __iomem *regs;
size_t regs_size;
--
cheers,
-roger
Texas Instruments Finland Oy, Porkkalankatu 22, 00180 Helsinki. Y-tunnus/Business ID: 0615521-4. Kotipaikka/Domicile: Helsinki
From: Lyude <cpaul(a)redhat.com>
As observed with the latest ThinkPad docks, we unfortunately can't rely
on docks keeping us updated with hotplug events that happened while we
were suspended. On top of that, even if the number of connectors remains
the same between suspend and resume it's still not safe to assume that
there were no hotplugs, since a different monitor might have been
plugged into a port another monitor previously occupied. As such, we
need to go through all of the MST ports and check whether or not their
EDIDs have changed.
In addition to that, we also now return -EINVAL from
drm_dp_mst_topology_mgr_resume to indicate to callers that they need to
reset the MST connection, and that they can't rely on any other method
of reprobing.
Cc: stable(a)vger.kernel.org
Signed-off-by: Lyude <cpaul(a)redhat.com>
Signed-off-by: Juston Li <juston.li(a)intel.com>
---
drivers/gpu/drm/drm_dp_mst_topology.c | 94 ++++++++++++++++++++++++++-
1 file changed, 93 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c
index 5ff1d79b86c4..88abebe52021 100644
--- a/drivers/gpu/drm/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/drm_dp_mst_topology.c
@@ -29,6 +29,7 @@
#include <linux/i2c.h>
#include <drm/drm_dp_mst_helper.h>
#include <drm/drmP.h>
+#include <drm/drm_edid.h>
#include <drm/drm_fixed.h>
#include <drm/drm_atomic.h>
@@ -2201,6 +2202,64 @@ void drm_dp_mst_topology_mgr_suspend(struct drm_dp_mst_topology_mgr *mgr)
}
EXPORT_SYMBOL(drm_dp_mst_topology_mgr_suspend);
+static bool drm_dp_mst_edids_changed(struct drm_dp_mst_topology_mgr *mgr,
+ struct drm_dp_mst_port *port)
+{
+ struct drm_device *dev;
+ struct drm_connector *connector;
+ struct drm_dp_mst_port *dport;
+ struct drm_dp_mst_branch *mstb;
+ struct edid *current_edid, *cached_edid;
+ bool ret = false;
+
+ port = drm_dp_get_validated_port_ref(mgr, port);
+ if (!port)
+ return false;
+
+ mstb = drm_dp_get_validated_mstb_ref(mgr, port->mstb);
+ if (mstb) {
+ list_for_each_entry(dport, &port->mstb->ports, next) {
+ ret = drm_dp_mst_edids_changed(mgr, dport);
+ if (ret)
+ break;
+ }
+
+ drm_dp_put_mst_branch_device(mstb);
+ if (ret)
+ goto out;
+ }
+
+ connector = port->connector;
+ if (!connector || !port->aux.ddc.algo)
+ goto out;
+
+ dev = connector->dev;
+ mutex_lock(&dev->mode_config.mutex);
+
+ current_edid = drm_get_edid(connector, &port->aux.ddc);
+ if (connector->edid_blob_ptr)
+ cached_edid = (void *)connector->edid_blob_ptr->data;
+ else
+ return false;
+
+ if ((current_edid && cached_edid && memcmp(current_edid, cached_edid,
+ sizeof(struct edid)) != 0) ||
+ (!current_edid && cached_edid) || (current_edid && !cached_edid)) {
+ ret = true;
+ DRM_DEBUG_KMS("EDID on %s changed, reprobing connectors\n",
+ connector->name);
+ }
+
+ mutex_unlock(&dev->mode_config.mutex);
+
+ kfree(current_edid);
+
+out:
+ drm_dp_put_port(port);
+
+ return ret;
+}
+
/**
* drm_dp_mst_topology_mgr_resume() - resume the MST manager
* @mgr: manager to resume
@@ -2210,9 +2269,15 @@ EXPORT_SYMBOL(drm_dp_mst_topology_mgr_suspend);
*
* if the device fails this returns -1, and the driver should do
* a full MST reprobe, in case we were undocked.
+ *
+ * if the device can no longer be trusted, this returns -EINVAL
+ * and the driver should unconditionally disconnect and reconnect
+ * the dock.
*/
int drm_dp_mst_topology_mgr_resume(struct drm_dp_mst_topology_mgr *mgr)
{
+ struct drm_dp_mst_branch *mstb;
+ struct drm_dp_mst_port *port;
int ret = 0;
mutex_lock(&mgr->lock);
@@ -2246,8 +2311,35 @@ int drm_dp_mst_topology_mgr_resume(struct drm_dp_mst_topology_mgr *mgr)
drm_dp_check_mstb_guid(mgr->mst_primary, guid);
ret = 0;
- } else
+
+ /*
+ * Some hubs also forget to notify us of hotplugs that happened
+ * while we were in suspend, so we need to verify that the edid
+ * hasn't changed for any of the connectors. If it has been,
+ * we unfortunately can't rely on the dock updating us with
+ * hotplug events, so indicate we need a full reconnect.
+ */
+
+ /* MST's I2C helpers can't be used while holding this lock */
+ mutex_unlock(&mgr->lock);
+
+ mstb = drm_dp_get_validated_mstb_ref(mgr, mgr->mst_primary);
+ if (mstb) {
+ list_for_each_entry(port, &mstb->ports, next) {
+ if (drm_dp_mst_edids_changed(mgr, port)) {
+ ret = -EINVAL;
+ break;
+ }
+ }
+
+ drm_dp_put_mst_branch_device(mstb);
+ }
+ } else {
ret = -1;
+ mutex_unlock(&mgr->lock);
+ }
+
+ return ret;
out_unlock:
mutex_unlock(&mgr->lock);
--
2.17.2
Hi,
Could you please apply f8b39039cbf2a15f2b8c9f081e1cbd5dee00aaf5 to 4.9
and 4.14 ?
It fixes an Oops when Ethernet transmission times out.
As you can observe in the commit log, the Oops what initially observed
in 4.9.
The fix has been in mainline since 4.15
Thanks
Christophe
From: "Y.C. Chen" <yc_chen(a)aspeedtech.com>
The value of pitches is not correct while calling mode_set.
The issue we found so far on following system:
- Debian8 with XFCE Desktop
- Ubuntu with KDE Desktop
- SUSE15 with KDE Desktop
Signed-off-by: Y.C. Chen <yc_chen(a)aspeedtech.com>
---
drivers/gpu/drm/ast/ast_mode.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c
index 5e77d45..f06aae7 100644
--- a/drivers/gpu/drm/ast/ast_mode.c
+++ b/drivers/gpu/drm/ast/ast_mode.c
@@ -568,6 +568,7 @@ static int ast_crtc_do_set_base(struct drm_crtc *crtc,
}
ast_bo_unreserve(bo);
+ ast_set_offset_reg(crtc);
ast_set_start_address_crt1(crtc, (u32)gpu_addr);
return 0;
--
1.8.3.1
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 017b1660df89f5fb4bfe66c34e35f7d2031100c7 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Date: Fri, 5 Oct 2018 15:51:29 -0700
Subject: [PATCH] mm: migration: fix migration of huge PMD shared pages
The page migration code employs try_to_unmap() to try and unmap the source
page. This is accomplished by using rmap_walk to find all vmas where the
page is mapped. This search stops when page mapcount is zero. For shared
PMD huge pages, the page map count is always 1 no matter the number of
mappings. Shared mappings are tracked via the reference count of the PMD
page. Therefore, try_to_unmap stops prematurely and does not completely
unmap all mappings of the source page.
This problem can result is data corruption as writes to the original
source page can happen after contents of the page are copied to the target
page. Hence, data is lost.
This problem was originally seen as DB corruption of shared global areas
after a huge page was soft offlined due to ECC memory errors. DB
developers noticed they could reproduce the issue by (hotplug) offlining
memory used to back huge pages. A simple testcase can reproduce the
problem by creating a shared PMD mapping (note that this must be at least
PUD_SIZE in size and PUD_SIZE aligned (1GB on x86)), and using
migrate_pages() to migrate process pages between nodes while continually
writing to the huge pages being migrated.
To fix, have the try_to_unmap_one routine check for huge PMD sharing by
calling huge_pmd_unshare for hugetlbfs huge pages. If it is a shared
mapping it will be 'unshared' which removes the page table entry and drops
the reference on the PMD page. After this, flush caches and TLB.
mmu notifiers are called before locking page tables, but we can not be
sure of PMD sharing until page tables are locked. Therefore, check for
the possibility of PMD sharing before locking so that notifiers can
prepare for the worst possible case.
Link: http://lkml.kernel.org/r/20180823205917.16297-2-mike.kravetz@oracle.com
[mike.kravetz(a)oracle.com: make _range_in_vma() a static inline]
Link: http://lkml.kernel.org/r/6063f215-a5c8-2f0c-465a-2c515ddc952d@oracle.com
Fixes: 39dde65c9940 ("shared page table for hugetlb page")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Jerome Glisse <jglisse(a)redhat.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6b68e345f0ca..087fd5f48c91 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -140,6 +140,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end);
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
int write);
struct page *follow_huge_pd(struct vm_area_struct *vma,
@@ -170,6 +172,18 @@ static inline unsigned long hugetlb_total_pages(void)
return 0;
}
+static inline int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr,
+ pte_t *ptep)
+{
+ return 0;
+}
+
+static inline void adjust_range_if_pmd_sharing_possible(
+ struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+}
+
#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; })
#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a61ebe8ad4ca..0416a7204be3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2455,6 +2455,12 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
return vma;
}
+static inline bool range_in_vma(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ return (vma && vma->vm_start <= start && end <= vma->vm_end);
+}
+
#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c21775f196b..b903d746e132 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4545,12 +4545,40 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
/*
* check on proper vm_flags and page table alignment
*/
- if (vma->vm_flags & VM_MAYSHARE &&
- vma->vm_start <= base && end <= vma->vm_end)
+ if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
return true;
return false;
}
+/*
+ * Determine if start,end range within vma could be mapped by shared pmd.
+ * If yes, adjust start and end to cover range associated with possible
+ * shared pmd mappings.
+ */
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+ unsigned long check_addr = *start;
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
+ unsigned long a_start = check_addr & PUD_MASK;
+ unsigned long a_end = a_start + PUD_SIZE;
+
+ /*
+ * If sharing is possible, adjust start/end if necessary.
+ */
+ if (range_in_vma(vma, a_start, a_end)) {
+ if (a_start < *start)
+ *start = a_start;
+ if (a_end > *end)
+ *end = a_end;
+ }
+ }
+}
+
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
@@ -4648,6 +4676,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
return 0;
}
+
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+}
#define want_pmd_share() (0)
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
diff --git a/mm/rmap.c b/mm/rmap.c
index eb477809a5c0..1e79fac3186b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/*
- * We have to assume the worse case ie pmd for invalidation. Note that
- * the page can not be free in this function as call of try_to_unmap()
- * must hold a reference on the page.
+ * For THP, we have to assume the worse case ie pmd for invalidation.
+ * For hugetlb, it could be much worse if we need to do pud
+ * invalidation in the case of pmd sharing.
+ *
+ * Note that the page can not be free in this function as call of
+ * try_to_unmap() must hold a reference on the page.
*/
end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ if (PageHuge(page)) {
+ /*
+ * If sharing is possible, start and end will be adjusted
+ * accordingly.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+ }
mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
while (page_vma_mapped_walk(&pvmw)) {
@@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
+ if (PageHuge(page)) {
+ if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+ /*
+ * huge_pmd_unshare unmapped an entire PMD
+ * page. There is no way of knowing exactly
+ * which PMDs may be cached for this mm, so
+ * we must flush them all. start/end were
+ * already adjusted above to cover this range.
+ */
+ flush_cache_range(vma, start, end);
+ flush_tlb_range(vma, start, end);
+ mmu_notifier_invalidate_range(mm, start, end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & TTU_MIGRATION) &&
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 017b1660df89f5fb4bfe66c34e35f7d2031100c7 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Date: Fri, 5 Oct 2018 15:51:29 -0700
Subject: [PATCH] mm: migration: fix migration of huge PMD shared pages
The page migration code employs try_to_unmap() to try and unmap the source
page. This is accomplished by using rmap_walk to find all vmas where the
page is mapped. This search stops when page mapcount is zero. For shared
PMD huge pages, the page map count is always 1 no matter the number of
mappings. Shared mappings are tracked via the reference count of the PMD
page. Therefore, try_to_unmap stops prematurely and does not completely
unmap all mappings of the source page.
This problem can result is data corruption as writes to the original
source page can happen after contents of the page are copied to the target
page. Hence, data is lost.
This problem was originally seen as DB corruption of shared global areas
after a huge page was soft offlined due to ECC memory errors. DB
developers noticed they could reproduce the issue by (hotplug) offlining
memory used to back huge pages. A simple testcase can reproduce the
problem by creating a shared PMD mapping (note that this must be at least
PUD_SIZE in size and PUD_SIZE aligned (1GB on x86)), and using
migrate_pages() to migrate process pages between nodes while continually
writing to the huge pages being migrated.
To fix, have the try_to_unmap_one routine check for huge PMD sharing by
calling huge_pmd_unshare for hugetlbfs huge pages. If it is a shared
mapping it will be 'unshared' which removes the page table entry and drops
the reference on the PMD page. After this, flush caches and TLB.
mmu notifiers are called before locking page tables, but we can not be
sure of PMD sharing until page tables are locked. Therefore, check for
the possibility of PMD sharing before locking so that notifiers can
prepare for the worst possible case.
Link: http://lkml.kernel.org/r/20180823205917.16297-2-mike.kravetz@oracle.com
[mike.kravetz(a)oracle.com: make _range_in_vma() a static inline]
Link: http://lkml.kernel.org/r/6063f215-a5c8-2f0c-465a-2c515ddc952d@oracle.com
Fixes: 39dde65c9940 ("shared page table for hugetlb page")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Jerome Glisse <jglisse(a)redhat.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6b68e345f0ca..087fd5f48c91 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -140,6 +140,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end);
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
int write);
struct page *follow_huge_pd(struct vm_area_struct *vma,
@@ -170,6 +172,18 @@ static inline unsigned long hugetlb_total_pages(void)
return 0;
}
+static inline int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr,
+ pte_t *ptep)
+{
+ return 0;
+}
+
+static inline void adjust_range_if_pmd_sharing_possible(
+ struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+}
+
#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; })
#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a61ebe8ad4ca..0416a7204be3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2455,6 +2455,12 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
return vma;
}
+static inline bool range_in_vma(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ return (vma && vma->vm_start <= start && end <= vma->vm_end);
+}
+
#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c21775f196b..b903d746e132 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4545,12 +4545,40 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
/*
* check on proper vm_flags and page table alignment
*/
- if (vma->vm_flags & VM_MAYSHARE &&
- vma->vm_start <= base && end <= vma->vm_end)
+ if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
return true;
return false;
}
+/*
+ * Determine if start,end range within vma could be mapped by shared pmd.
+ * If yes, adjust start and end to cover range associated with possible
+ * shared pmd mappings.
+ */
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+ unsigned long check_addr = *start;
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
+ unsigned long a_start = check_addr & PUD_MASK;
+ unsigned long a_end = a_start + PUD_SIZE;
+
+ /*
+ * If sharing is possible, adjust start/end if necessary.
+ */
+ if (range_in_vma(vma, a_start, a_end)) {
+ if (a_start < *start)
+ *start = a_start;
+ if (a_end > *end)
+ *end = a_end;
+ }
+ }
+}
+
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
@@ -4648,6 +4676,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
return 0;
}
+
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+}
#define want_pmd_share() (0)
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
diff --git a/mm/rmap.c b/mm/rmap.c
index eb477809a5c0..1e79fac3186b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/*
- * We have to assume the worse case ie pmd for invalidation. Note that
- * the page can not be free in this function as call of try_to_unmap()
- * must hold a reference on the page.
+ * For THP, we have to assume the worse case ie pmd for invalidation.
+ * For hugetlb, it could be much worse if we need to do pud
+ * invalidation in the case of pmd sharing.
+ *
+ * Note that the page can not be free in this function as call of
+ * try_to_unmap() must hold a reference on the page.
*/
end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ if (PageHuge(page)) {
+ /*
+ * If sharing is possible, start and end will be adjusted
+ * accordingly.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+ }
mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
while (page_vma_mapped_walk(&pvmw)) {
@@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
+ if (PageHuge(page)) {
+ if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+ /*
+ * huge_pmd_unshare unmapped an entire PMD
+ * page. There is no way of knowing exactly
+ * which PMDs may be cached for this mm, so
+ * we must flush them all. start/end were
+ * already adjusted above to cover this range.
+ */
+ flush_cache_range(vma, start, end);
+ flush_tlb_range(vma, start, end);
+ mmu_notifier_invalidate_range(mm, start, end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & TTU_MIGRATION) &&