When both the RHBA and RPA FDMI requests time out, fnic reuses a frame to send ABTS for each of them. On send completion, this causes an attempt to free the same frame twice that leads to a crash.
Fix crash by allocating separate frames for RHBA and RPA, and modify ABTS logic accordingly.
Tested by checking MDS for FDMI information. Tested by using instrumented driver to: Drop PLOGI response Drop RHBA response Drop RPA response Drop RHBA and RPA response Drop PLOGI response + ABTS response Drop RHBA response + ABTS response Drop RPA response + ABTS response Drop RHBA and RPA response + ABTS response for both of them
Fixes: 09c1e6ab4ab2 ("scsi: fnic: Add and integrate support for FDMI") Reviewed-by: Sesidhar Baddela sebaddel@cisco.com Reviewed-by: Arulprabhu Ponnusamy arulponn@cisco.com Reviewed-by: Gian Carlo Boffa gcboffa@cisco.com Tested-by: Arun Easi aeasi@cisco.com Co-developed-by: Arun Easi aeasi@cisco.com Signed-off-by: Arun Easi aeasi@cisco.com Tested-by: Karan Tilak Kumar kartilak@cisco.com Cc: stable@vger.kernel.org # 6.14.x Please see patch description Signed-off-by: Karan Tilak Kumar kartilak@cisco.com --- drivers/scsi/fnic/fdls_disc.c | 113 +++++++++++++++++++++++++--------- drivers/scsi/fnic/fnic_fdls.h | 1 + 2 files changed, 86 insertions(+), 28 deletions(-)
diff --git a/drivers/scsi/fnic/fdls_disc.c b/drivers/scsi/fnic/fdls_disc.c index c2b6f4eb338e..0ee1b74967b9 100644 --- a/drivers/scsi/fnic/fdls_disc.c +++ b/drivers/scsi/fnic/fdls_disc.c @@ -763,47 +763,69 @@ static void fdls_send_fabric_abts(struct fnic_iport_s *iport) iport->fabric.timer_pending = 1; }
-static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) +static uint8_t *fdls_alloc_init_fdmi_abts_frame(struct fnic_iport_s *iport, + uint16_t oxid) { - uint8_t *frame; + struct fc_frame_header *pfdmi_abts; uint8_t d_id[3]; + uint8_t *frame; struct fnic *fnic = iport->fnic; - struct fc_frame_header *pfabric_abts; - unsigned long fdmi_tov; - uint16_t oxid; - uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET + - sizeof(struct fc_frame_header);
frame = fdls_alloc_frame(iport); if (frame == NULL) { FNIC_FCS_DBG(KERN_ERR, fnic->host, fnic->fnic_num, "Failed to allocate frame to send FDMI ABTS"); - return; + return NULL; }
- pfabric_abts = (struct fc_frame_header *) (frame + FNIC_ETH_FCOE_HDRS_OFFSET); + pfdmi_abts = (struct fc_frame_header *) (frame + FNIC_ETH_FCOE_HDRS_OFFSET); fdls_init_fabric_abts_frame(frame, iport);
hton24(d_id, FC_FID_MGMT_SERV); - FNIC_STD_SET_D_ID(*pfabric_abts, d_id); + FNIC_STD_SET_D_ID(*pfdmi_abts, d_id); + FNIC_STD_SET_OX_ID(*pfdmi_abts, oxid); + + return frame; +} + +static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) +{ + uint8_t *frame; + unsigned long fdmi_tov; + uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET + + sizeof(struct fc_frame_header);
if (iport->fabric.fdmi_pending & FDLS_FDMI_PLOGI_PENDING) { - oxid = iport->active_oxid_fdmi_plogi; - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid); + frame = fdls_alloc_init_fdmi_abts_frame(iport, + iport->active_oxid_fdmi_plogi); + if (frame == NULL) + return; + fnic_send_fcoe_frame(iport, frame, frame_size); } else { if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) { - oxid = iport->active_oxid_fdmi_rhba; - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid); + frame = fdls_alloc_init_fdmi_abts_frame(iport, + iport->active_oxid_fdmi_rhba); + if (frame == NULL) + return; + fnic_send_fcoe_frame(iport, frame, frame_size); } if (iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING) { - oxid = iport->active_oxid_fdmi_rpa; - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid); + frame = fdls_alloc_init_fdmi_abts_frame(iport, + iport->active_oxid_fdmi_rpa); + if (frame == NULL) { + if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) + goto arm_timer; + else + return; + } + fnic_send_fcoe_frame(iport, frame, frame_size); } }
+arm_timer: fdmi_tov = jiffies + msecs_to_jiffies(2 * iport->e_d_tov); mod_timer(&iport->fabric.fdmi_timer, round_jiffies(fdmi_tov)); iport->fabric.fdmi_pending |= FDLS_FDMI_ABORT_PENDING; @@ -2244,6 +2266,21 @@ void fdls_fabric_timer_callback(struct timer_list *t) spin_unlock_irqrestore(&fnic->fnic_lock, flags); }
+void fdls_fdmi_retry_plogi(struct fnic_iport_s *iport) +{ + struct fnic *fnic = iport->fnic; + + iport->fabric.fdmi_pending = 0; + /* If max retries not exhausted, start over from fdmi plogi */ + if (iport->fabric.fdmi_retry < FDLS_FDMI_MAX_RETRY) { + iport->fabric.fdmi_retry++; + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "Retry FDMI PLOGI. FDMI retry: %d", + iport->fabric.fdmi_retry); + fdls_send_fdmi_plogi(iport); + } +} + void fdls_fdmi_timer_callback(struct timer_list *t) { struct fnic_fdls_fabric_s *fabric = from_timer(fabric, t, fdmi_timer); @@ -2289,14 +2326,7 @@ void fdls_fdmi_timer_callback(struct timer_list *t) FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending);
- iport->fabric.fdmi_pending = 0; - /* If max retries not exhaused, start over from fdmi plogi */ - if (iport->fabric.fdmi_retry < FDLS_FDMI_MAX_RETRY) { - iport->fabric.fdmi_retry++; - FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "retry fdmi timer %d", iport->fabric.fdmi_retry); - fdls_send_fdmi_plogi(iport); - } + fdls_fdmi_retry_plogi(iport); FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); spin_unlock_irqrestore(&fnic->fnic_lock, flags); @@ -3714,11 +3744,32 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, switch (FNIC_FRAME_TYPE(oxid)) { case FNIC_FRAME_TYPE_FDMI_PLOGI: fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_plogi); + + iport->fabric.fdmi_pending &= ~FDLS_FDMI_PLOGI_PENDING; + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; break; case FNIC_FRAME_TYPE_FDMI_RHBA: + iport->fabric.fdmi_pending &= ~FDLS_FDMI_REG_HBA_PENDING; + + /* If RPA is still pending, don't turn off ABORT PENDING. + * We count on the timer to detect the ABTS timeout and take + * corrective action. + */ + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING)) + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; + fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rhba); break; case FNIC_FRAME_TYPE_FDMI_RPA: + iport->fabric.fdmi_pending &= ~FDLS_FDMI_RPA_PENDING; + + /* If RHBA is still pending, don't turn off ABORT PENDING. + * We count on the timer to detect the ABTS timeout and take + * corrective action. + */ + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING)) + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; + fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rpa); break; default: @@ -3728,10 +3779,16 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, break; }
- timer_delete_sync(&iport->fabric.fdmi_timer); - iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; - - fdls_send_fdmi_plogi(iport); + /* + * Only if ABORT PENDING is off, delete the timer, and if no other + * operations are pending, retry FDMI. + * Otherwise, let the timer pop and take the appropriate action. + */ + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_ABORT_PENDING)) { + timer_delete_sync(&iport->fabric.fdmi_timer); + if (!iport->fabric.fdmi_pending) + fdls_fdmi_retry_plogi(iport); + } }
static void diff --git a/drivers/scsi/fnic/fnic_fdls.h b/drivers/scsi/fnic/fnic_fdls.h index 8e610b65ad57..531d0b37e450 100644 --- a/drivers/scsi/fnic/fnic_fdls.h +++ b/drivers/scsi/fnic/fnic_fdls.h @@ -394,6 +394,7 @@ void fdls_send_tport_abts(struct fnic_iport_s *iport, bool fdls_delete_tport(struct fnic_iport_s *iport, struct fnic_tport_s *tport); void fdls_fdmi_timer_callback(struct timer_list *t); +void fdls_fdmi_retry_plogi(struct fnic_iport_s *iport);
/* fnic_fcs.c */ void fnic_fdls_init(struct fnic *fnic, int usefip);
On Wed, Jun 11, 2025 at 05:44:23PM -0700, Karan Tilak Kumar wrote:
When both the RHBA and RPA FDMI requests time out, fnic reuses a frame to send ABTS for each of them. On send completion, this causes an attempt to free the same frame twice that leads to a crash.
Fix crash by allocating separate frames for RHBA and RPA, and modify ABTS logic accordingly.
Tested by checking MDS for FDMI information. Tested by using instrumented driver to: Drop PLOGI response Drop RHBA response Drop RPA response Drop RHBA and RPA response Drop PLOGI response + ABTS response Drop RHBA response + ABTS response Drop RPA response + ABTS response Drop RHBA and RPA response + ABTS response for both of them
Fixes: 09c1e6ab4ab2 ("scsi: fnic: Add and integrate support for FDMI") Reviewed-by: Sesidhar Baddela sebaddel@cisco.com Reviewed-by: Arulprabhu Ponnusamy arulponn@cisco.com Reviewed-by: Gian Carlo Boffa gcboffa@cisco.com Tested-by: Arun Easi aeasi@cisco.com Co-developed-by: Arun Easi aeasi@cisco.com Signed-off-by: Arun Easi aeasi@cisco.com Tested-by: Karan Tilak Kumar kartilak@cisco.com Cc: stable@vger.kernel.org # 6.14.x Please see patch description
I'm a bit confused. Why do we need to specify 6.14.x? I would have assumed that the Fixes tag was enough information. What are we supposed to see in the patch description?
I suspect you're making this too complicated... Just put Cc: stable@vger.kernel.org and a Fixes tag and let the scripts figure it out. Or put in the commit description, "The Fixes tag points to an older kernel because XXX but really this should only be backported to 6.14.x because YYY."
regards, dan carpenter
On Thu, Jun 12, 2025 at 10:42:30AM +0300, Dan Carpenter wrote:
On Wed, Jun 11, 2025 at 05:44:23PM -0700, Karan Tilak Kumar wrote:
When both the RHBA and RPA FDMI requests time out, fnic reuses a frame to send ABTS for each of them. On send completion, this causes an attempt to free the same frame twice that leads to a crash.
Fix crash by allocating separate frames for RHBA and RPA, and modify ABTS logic accordingly.
Tested by checking MDS for FDMI information. Tested by using instrumented driver to: Drop PLOGI response Drop RHBA response Drop RPA response Drop RHBA and RPA response Drop PLOGI response + ABTS response Drop RHBA response + ABTS response Drop RPA response + ABTS response Drop RHBA and RPA response + ABTS response for both of them
Fixes: 09c1e6ab4ab2 ("scsi: fnic: Add and integrate support for FDMI") Reviewed-by: Sesidhar Baddela sebaddel@cisco.com Reviewed-by: Arulprabhu Ponnusamy arulponn@cisco.com Reviewed-by: Gian Carlo Boffa gcboffa@cisco.com Tested-by: Arun Easi aeasi@cisco.com Co-developed-by: Arun Easi aeasi@cisco.com Signed-off-by: Arun Easi aeasi@cisco.com Tested-by: Karan Tilak Kumar kartilak@cisco.com Cc: stable@vger.kernel.org # 6.14.x Please see patch description
I'm a bit confused. Why do we need to specify 6.14.x? I would have assumed that the Fixes tag was enough information. What are we supposed to see in the patch description?
I suspect you're making this too complicated... Just put Cc: stable@vger.kernel.org and a Fixes tag and let the scripts figure it out. Or put in the commit description, "The Fixes tag points to an older kernel because XXX but really this should only be backported to 6.14.x because YYY."
But here even with the comment in the commit description, you would still just say:
Cc: stable@vger.kernel.org # 6.14.x
The stable maintainers trust you to list the correct kernel and don't need to know the reasoning.
I much prefer to keep it simple whenever possible. We had bad CVE where someone left off the Fixes tag and instead specified "Cc: stable@vger.kernel.org # 4.1" where 4.1 was the oldest supported kernel on kernel.org. The patch should have been applied to the older vendor kernels but it wasn't because the the tag was wrong.
regards, dan carpenter
On Thursday, June 12, 2025 1:39 AM, Dan Carpenter dan.carpenter@linaro.org wrote:
On Thu, Jun 12, 2025 at 10:42:30AM +0300, Dan Carpenter wrote:
On Wed, Jun 11, 2025 at 05:44:23PM -0700, Karan Tilak Kumar wrote:
When both the RHBA and RPA FDMI requests time out, fnic reuses a frame to send ABTS for each of them. On send completion, this causes an attempt to free the same frame twice that leads to a crash.
Fix crash by allocating separate frames for RHBA and RPA, and modify ABTS logic accordingly.
Tested by checking MDS for FDMI information. Tested by using instrumented driver to: Drop PLOGI response Drop RHBA response Drop RPA response Drop RHBA and RPA response Drop PLOGI response + ABTS response Drop RHBA response + ABTS response Drop RPA response + ABTS response Drop RHBA and RPA response + ABTS response for both of them
Fixes: 09c1e6ab4ab2 ("scsi: fnic: Add and integrate support for FDMI") Reviewed-by: Sesidhar Baddela sebaddel@cisco.com Reviewed-by: Arulprabhu Ponnusamy arulponn@cisco.com Reviewed-by: Gian Carlo Boffa gcboffa@cisco.com Tested-by: Arun Easi aeasi@cisco.com Co-developed-by: Arun Easi aeasi@cisco.com Signed-off-by: Arun Easi aeasi@cisco.com Tested-by: Karan Tilak Kumar kartilak@cisco.com Cc: stable@vger.kernel.org # 6.14.x Please see patch description
I'm a bit confused. Why do we need to specify 6.14.x? I would have assumed that the Fixes tag was enough information. What are we supposed to see in the patch description?
I suspect you're making this too complicated... Just put Cc: stable@vger.kernel.org and a Fixes tag and let the scripts figure it out. Or put in the commit description, "The Fixes tag points to an older kernel because XXX but really this should only be backported to 6.14.x because YYY."
But here even with the comment in the commit description, you would still just say:
Cc: stable@vger.kernel.org # 6.14.x
The stable maintainers trust you to list the correct kernel and don't need to know the reasoning.
Thanks for clarifying this Dan.
I much prefer to keep it simple whenever possible. We had bad CVE where someone left off the Fixes tag and instead specified "Cc: stable@vger.kernel.org # 4.1" where 4.1 was the oldest supported kernel on kernel.org. The patch should have been applied to the older vendor kernels but it wasn't because the the tag was wrong.
I understand and agree with you. I prefer to keep it simple as well. In V4, as you suggested, I'll keep the fixes tag and the Cc: tag and remove the comments section.
regards, dan carpenter
Thanks for your review comments.
Regards, Karan
linux-stable-mirror@lists.linaro.org