Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X when the device's MSI-X table size is larger than the firmware's MSI quota for the device.
This is because the commit changes how rtas_prepare_msi_irqs() is called:
- Before, it is called when interrupts are allocated at the global interrupt domain with nvec_in being the number of allocated interrupts. rtas_prepare_msi_irqs() can return a positive number and the allocation will be retried.
- Now, it is called at the creation of per-device interrupt domain with nvec_in being the number of interrupts that the device supports. If rtas_prepare_msi_irqs() returns positive, domain creation just fails.
For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive number (the quota). This causes per-device interrupt domain creation to fail and thus the NVMe driver cannot enable MSI-X.
Rework to make this scenario works again:
- pseries_msi_ops_prepare() only prepares as many interrupts as the quota permit.
- pseries_irq_domain_alloc() fails if the device's quota is exceeded.
Now, if the quota is exceeded, pseries_msi_ops_prepare() will only prepare as allowed by the quota. If device drivers attempt to allocate more interrupts than the quota permits, pseries_irq_domain_alloc() will return an error code and msi_handle_pci_fail() will allow device drivers a retry.
Reported-by: Nilay Shroff nilay@linux.ibm.com Closes: https://lore.kernel.org/linuxppc-dev/6af2c4c2-97f6-4758-be33-256638ef39e5@li... Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()") Signed-off-by: Nam Cao namcao@linutronix.de Acked-by: Nilay Shroff nilay@linux.ibm.com Cc: stable@vger.kernel.org --- v2: - change pseries_msi_ops_prepare()'s allocation logic to match the original logic in __pci_enable_msix_range()
- fix up Nilay's email address --- arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index a82aaa786e9e..edc30cda5dbc 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -19,6 +19,11 @@
#include "pseries.h"
+struct pseries_msi_device { + unsigned int msi_quota; + unsigned int msi_used; +}; + static int query_token, change_token;
#define RTAS_QUERY_FN 0 @@ -433,8 +438,28 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev struct msi_domain_info *info = domain->host_data; struct pci_dev *pdev = to_pci_dev(dev); int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; + int ret; + + struct pseries_msi_device *pseries_dev __free(kfree) + = kmalloc(sizeof(*pseries_dev), GFP_KERNEL); + if (!pseries_dev) + return -ENOMEM; + + while (1) { + ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg); + if (!ret) + break; + else if (ret > 0) + nvec = ret; + else + return ret; + }
- return rtas_prepare_msi_irqs(pdev, nvec, type, arg); + pseries_dev->msi_quota = nvec; + pseries_dev->msi_used = 0; + + arg->scratchpad[0].ptr = no_free_ptr(pseries_dev); + return 0; }
/* @@ -443,9 +468,13 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev */ static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg) { + struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr; struct pci_dev *pdev = to_pci_dev(domain->dev);
rtas_disable_msi(pdev); + + WARN_ON(pseries_dev->msi_used); + kfree(pseries_dev); }
static void pseries_msi_shutdown(struct irq_data *d) @@ -546,12 +575,18 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq unsigned int nr_irqs, void *arg) { struct pci_controller *phb = domain->host_data; + struct pseries_msi_device *pseries_dev; msi_alloc_info_t *info = arg; struct msi_desc *desc = info->desc; struct pci_dev *pdev = msi_desc_to_pci_dev(desc); int hwirq; int i, ret;
+ pseries_dev = info->scratchpad[0].ptr; + + if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota) + return -ENOSPC; + hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index); if (hwirq < 0) { dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq); @@ -567,9 +602,10 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq goto out;
irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, - &pseries_msi_irq_chip, domain->host_data); + &pseries_msi_irq_chip, pseries_dev); }
+ pseries_dev->msi_used++; return 0;
out: @@ -582,9 +618,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq unsigned int nr_irqs) { struct irq_data *d = irq_domain_get_irq_data(domain, virq); - struct pci_controller *phb = irq_data_get_irq_chip_data(d); + struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d); + struct pci_controller *phb = domain->host_data;
pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs); + pseries_dev->msi_used -= nr_irqs; irq_domain_free_irqs_parent(domain, virq, nr_irqs); }
On 1/7/26 3:32 PM, Nam Cao wrote:
Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X when the device's MSI-X table size is larger than the firmware's MSI quota for the device.
This is because the commit changes how rtas_prepare_msi_irqs() is called:
Before, it is called when interrupts are allocated at the global interrupt domain with nvec_in being the number of allocated interrupts. rtas_prepare_msi_irqs() can return a positive number and the allocation will be retried.
Now, it is called at the creation of per-device interrupt domain with nvec_in being the number of interrupts that the device supports. If rtas_prepare_msi_irqs() returns positive, domain creation just fails.
For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive number (the quota). This causes per-device interrupt domain creation to fail and thus the NVMe driver cannot enable MSI-X.
Rework to make this scenario works again:
pseries_msi_ops_prepare() only prepares as many interrupts as the quota permit.
pseries_irq_domain_alloc() fails if the device's quota is exceeded.
Now, if the quota is exceeded, pseries_msi_ops_prepare() will only prepare as allowed by the quota. If device drivers attempt to allocate more interrupts than the quota permits, pseries_irq_domain_alloc() will return an error code and msi_handle_pci_fail() will allow device drivers a retry.
Reported-by: Nilay Shroff nilay@linux.ibm.com Closes: https://lore.kernel.org/linuxppc-dev/6af2c4c2-97f6-4758-be33-256638ef39e5@li... Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()") Signed-off-by: Nam Cao namcao@linutronix.de Acked-by: Nilay Shroff nilay@linux.ibm.com Cc: stable@vger.kernel.org
v2:
change pseries_msi_ops_prepare()'s allocation logic to match the original logic in __pci_enable_msix_range()
fix up Nilay's email address
arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index a82aaa786e9e..edc30cda5dbc 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -19,6 +19,11 @@ #include "pseries.h" +struct pseries_msi_device {
- unsigned int msi_quota;
- unsigned int msi_used;
+};
static int query_token, change_token; #define RTAS_QUERY_FN 0 @@ -433,8 +438,28 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev struct msi_domain_info *info = domain->host_data; struct pci_dev *pdev = to_pci_dev(dev); int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
- int ret;
- struct pseries_msi_device *pseries_dev __free(kfree)
= kmalloc(sizeof(*pseries_dev), GFP_KERNEL);- if (!pseries_dev)
return -ENOMEM;- while (1) {
ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);if (!ret)break;else if (ret > 0)nvec = ret;elsereturn ret;- }
- return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
- pseries_dev->msi_quota = nvec;
- pseries_dev->msi_used = 0;
- arg->scratchpad[0].ptr = no_free_ptr(pseries_dev);
- return 0;
} /* @@ -443,9 +468,13 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev */ static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg) {
- struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr; struct pci_dev *pdev = to_pci_dev(domain->dev);
rtas_disable_msi(pdev);
- WARN_ON(pseries_dev->msi_used);
- kfree(pseries_dev);
} static void pseries_msi_shutdown(struct irq_data *d) @@ -546,12 +575,18 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq unsigned int nr_irqs, void *arg) { struct pci_controller *phb = domain->host_data;
- struct pseries_msi_device *pseries_dev; msi_alloc_info_t *info = arg; struct msi_desc *desc = info->desc; struct pci_dev *pdev = msi_desc_to_pci_dev(desc); int hwirq; int i, ret;
- pseries_dev = info->scratchpad[0].ptr;
- if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota)
return -ENOSPC;- hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index); if (hwirq < 0) { dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
@@ -567,9 +602,10 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq goto out; irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
&pseries_msi_irq_chip, domain->host_data);
}&pseries_msi_irq_chip, pseries_dev);
- pseries_dev->msi_used++; return 0;
out: @@ -582,9 +618,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq unsigned int nr_irqs) { struct irq_data *d = irq_domain_get_irq_data(domain, virq);
- struct pci_controller *phb = irq_data_get_irq_chip_data(d);
- struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d);
- struct pci_controller *phb = domain->host_data;
pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
- pseries_dev->msi_used -= nr_irqs; irq_domain_free_irqs_parent(domain, virq, nr_irqs);
}
I just tested this change on my system using the latest mainline kernel and it works well for me. So with that, please fell free to add,
Tested-by: Nilay Shroff nilay@linux.ibm.com
On 1/7/26 7:55 PM, Nilay Shroff wrote:
On 1/7/26 3:32 PM, Nam Cao wrote:
Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X when the device's MSI-X table size is larger than the firmware's MSI quota for the device.
This is because the commit changes how rtas_prepare_msi_irqs() is called:
Before, it is called when interrupts are allocated at the global interrupt domain with nvec_in being the number of allocated interrupts. rtas_prepare_msi_irqs() can return a positive number and the allocation will be retried.
Now, it is called at the creation of per-device interrupt domain with nvec_in being the number of interrupts that the device supports. If rtas_prepare_msi_irqs() returns positive, domain creation just fails.
For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive number (the quota). This causes per-device interrupt domain creation to fail and thus the NVMe driver cannot enable MSI-X.
Rework to make this scenario works again:
pseries_msi_ops_prepare() only prepares as many interrupts as the quota permit.
pseries_irq_domain_alloc() fails if the device's quota is exceeded.
Now, if the quota is exceeded, pseries_msi_ops_prepare() will only prepare as allowed by the quota. If device drivers attempt to allocate more interrupts than the quota permits, pseries_irq_domain_alloc() will return an error code and msi_handle_pci_fail() will allow device drivers a retry.
Reported-by: Nilay Shroff nilay@linux.ibm.com Closes: https://lore.kernel.org/linuxppc-dev/6af2c4c2-97f6-4758-be33-256638ef39e5@li... Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()") Signed-off-by: Nam Cao namcao@linutronix.de Acked-by: Nilay Shroff nilay@linux.ibm.com Cc: stable@vger.kernel.org
v2:
change pseries_msi_ops_prepare()'s allocation logic to match the original logic in __pci_enable_msix_range()
fix up Nilay's email address
arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index a82aaa786e9e..edc30cda5dbc 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -19,6 +19,11 @@ #include "pseries.h" +struct pseries_msi_device {
- unsigned int msi_quota;
- unsigned int msi_used;
+};
- static int query_token, change_token;
#define RTAS_QUERY_FN 0 @@ -433,8 +438,28 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev struct msi_domain_info *info = domain->host_data; struct pci_dev *pdev = to_pci_dev(dev); int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
- int ret;
- struct pseries_msi_device *pseries_dev __free(kfree)
= kmalloc(sizeof(*pseries_dev), GFP_KERNEL);- if (!pseries_dev)
return -ENOMEM;- while (1) {
ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);if (!ret)break;else if (ret > 0)nvec = ret;elsereturn ret;- }
- return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
- pseries_dev->msi_quota = nvec;
- pseries_dev->msi_used = 0;
- arg->scratchpad[0].ptr = no_free_ptr(pseries_dev);
- return 0; }
/* @@ -443,9 +468,13 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev */ static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg) {
- struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr; struct pci_dev *pdev = to_pci_dev(domain->dev);
rtas_disable_msi(pdev);
- WARN_ON(pseries_dev->msi_used);
- kfree(pseries_dev); }
static void pseries_msi_shutdown(struct irq_data *d) @@ -546,12 +575,18 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq unsigned int nr_irqs, void *arg) { struct pci_controller *phb = domain->host_data;
- struct pseries_msi_device *pseries_dev; msi_alloc_info_t *info = arg; struct msi_desc *desc = info->desc; struct pci_dev *pdev = msi_desc_to_pci_dev(desc); int hwirq; int i, ret;
- pseries_dev = info->scratchpad[0].ptr;
- if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota)
return -ENOSPC;- hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index); if (hwirq < 0) { dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
@@ -567,9 +602,10 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq goto out; irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
&pseries_msi_irq_chip, domain->host_data);
}&pseries_msi_irq_chip, pseries_dev);
- pseries_dev->msi_used++; return 0;
out: @@ -582,9 +618,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq unsigned int nr_irqs) { struct irq_data *d = irq_domain_get_irq_data(domain, virq);
- struct pci_controller *phb = irq_data_get_irq_chip_data(d);
- struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d);
- struct pci_controller *phb = domain->host_data;
pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
- pseries_dev->msi_used -= nr_irqs; irq_domain_free_irqs_parent(domain, virq, nr_irqs); }
I just tested this change on my system using the latest mainline kernel and it works well for me. So with that, please fell free to add,
Tested-by: Nilay Shroff nilay@linux.ibm.com
Thanks will pull this in. Maddy
linux-stable-mirror@lists.linaro.org