On Tue, Nov 04, 2025 at 02:30:11PM -0400, Jason Gunthorpe wrote:
From: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Replace the io_pgtable versions with pt_iommu versions. The v2 page table uses the x86 implementation that will eventually be shared with VT-d.
This supports the same special features as the original code:
- increase_top for the v1 format to allow scaling from 3 to 6 levels
- non-present flushing
- Dirty tracking for v1 only
- __sme_set() to adjust the PTEs for CC
- Optimization for flushing with virtualization to minimize the range
- amd_iommu_pgsize_bitmap override of the native page sizes
- page tables are allocated from the device's NUMA node (a short illustrative sketch follows this list)
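As a side note on that last bullet, here is a hedged sketch of what NUMA-local page-table allocation looks like in general. alloc_pt_page() is a made-up helper name used only for illustration; in the series the node is recorded as domain->iommu.nid (see the patch below) and the allocation happens inside the generic PT code.

    /*
     * Illustrative sketch only: allocate a page-table page on the NUMA node
     * of the device the domain was created for.
     */
    static void *alloc_pt_page(struct device *dev, gfp_t gfp)
    {
            struct page *page;

            page = alloc_pages_node(dev_to_node(dev), gfp | __GFP_ZERO, 0);
            return page ? page_address(page) : NULL;
    }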
Rework the domain ops so that v1/v2 get their own ops. Make dedicated allocation functions for v1 and v2. Hook up invalidation for a top change to struct pt_iommu_flush_ops. Delete some of the iopgtable related code that becomes unused in this patch. The next patch will delete the rest of it.
This fixes a race bug in AMD's increase_address_space() implementation. It stores the top level and top pointer in different memory, which prevents other threads from reading a coherent version:
    increase_address_space()              alloc_pte()
                                            level = pgtable->mode - 1;
      pgtable->root  = pte;
      pgtable->mode += 1;
                                            pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
The iommupt version is careful to put mode and root under a single READ_ONCE, and then to READ_ONCE only a single time per walk.
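For illustration, a minimal sketch of that pattern (this is not the iommupt code itself; publish_top()/read_top() and the bit packing are assumptions made for the example): the level is encoded in the low alignment bits of the top-table pointer so that a single WRITE_ONCE()/READ_ONCE() pair publishes and observes both values together.

    /* Table pointers are at least 8-byte aligned, so 3 low bits are free. */
    #define TOP_LEVEL_MASK 0x7UL

    static void publish_top(unsigned long *top_of_table, void *root,
                            unsigned int level)
    {
            /* One store publishes the new root and its level atomically. */
            WRITE_ONCE(*top_of_table, (unsigned long)root | level);
    }

    static void *read_top(unsigned long *top_of_table, unsigned int *level)
    {
            /* One snapshot per walk; never re-read mid-walk. */
            unsigned long v = READ_ONCE(*top_of_table);

            *level = v & TOP_LEVEL_MASK;
            return (void *)(v & ~TOP_LEVEL_MASK);
    }

A walker that takes this snapshot once at the start of a walk can never pair a new root with a stale mode the way the racy sequence above can.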
Signed-off-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Ankit Soni <Ankit.Soni@amd.com>
I am a little late, but I have tested this patch series in both v1 and v2 page table modes using multiple benchmark tools (FIO, netperf, etc.). The changes work as expected, and I observed no regressions.
 drivers/iommu/amd/Kconfig           |   5 +-
 drivers/iommu/amd/amd_iommu.h       |   1 -
 drivers/iommu/amd/amd_iommu_types.h |  12 +-
 drivers/iommu/amd/io_pgtable.c      |   2 -
 drivers/iommu/amd/iommu.c           | 538 ++++++++++++++--------------
 5 files changed, 282 insertions(+), 276 deletions(-)
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
index ecef69c11144db..f2acf471cb5d9f 100644
--- a/drivers/iommu/amd/Kconfig
+++ b/drivers/iommu/amd/Kconfig
@@ -11,10 +11,13 @@ config AMD_IOMMU
 	select MMU_NOTIFIER
 	select IOMMU_API
 	select IOMMU_IOVA
-	select IOMMU_IO_PGTABLE
 	select IOMMU_SVA
 	select IOMMU_IOPF
 	select IOMMUFD_DRIVER if IOMMUFD
+	select GENERIC_PT
+	select IOMMU_PT
+	select IOMMU_PT_AMDV1
+	select IOMMU_PT_X86_64
 	depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
 	help
 	  With this option you can enable support for AMD IOMMU hardware in
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 9b4b589a54b57e..25044d28f28a8d 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -88,7 +88,6 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag);
  * the IOMMU used by this driver.
  */
 void amd_iommu_flush_all_caches(struct amd_iommu *iommu);
-void amd_iommu_update_and_flush_device_table(struct protection_domain *domain);
 void amd_iommu_domain_flush_pages(struct protection_domain *domain,
 				  u64 address, size_t size);
 void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index a698a2e7ce2a6e..d90a285b44eb3a 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -19,6 +19,7 @@
 #include <linux/pci.h>
 #include <linux/irqreturn.h>
 #include <linux/io-pgtable.h>
+#include <linux/generic_pt/iommu.h>
 
 /*
  * Maximum number of IOMMUs supported
@@ -589,9 +590,13 @@ struct pdom_iommu_info {
  * independent of their use.
  */
 struct protection_domain {
+	union {
+		struct iommu_domain domain;
+		struct pt_iommu iommu;
+		struct pt_iommu_amdv1 amdv1;
+		struct pt_iommu_x86_64 amdv2;
+	};
+
 	struct list_head dev_list; /* List of all devices in this domain */
-	struct iommu_domain domain; /* generic domain handle used by
-				       iommu core code */
-	struct amd_io_pgtable iop;
 	spinlock_t lock;	/* mostly used to lock the page table*/
 	u16 id;			/* the domain id written to the device table */
@@ -602,6 +607,9 @@ struct protection_domain {
 	struct mmu_notifier mn;	/* mmu notifier for the SVA domain */
 	struct list_head dev_data_list; /* List of pdom_dev_data */
 };
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain);
 
 /*
  * This structure contains information about one PCI segment in the system.
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index 70c2f5b1631b05..f64244938c9af7 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -136,8 +136,6 @@ static bool increase_address_space(struct amd_io_pgtable *pgtable,
 	pgtable->mode += 1;
 	write_seqcount_end(&pgtable->seqcount);
 
-	amd_iommu_update_and_flush_device_table(domain);
-
 	pte = NULL;
 	ret = true;
 
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 2e1865daa1cee8..0dc4d9682000bf 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -30,7 +30,6 @@
 #include <linux/msi.h>
 #include <linux/irqdomain.h>
 #include <linux/percpu.h>
-#include <linux/io-pgtable.h>
 #include <linux/cc_platform.h>
 #include <asm/irq_remapping.h>
 #include <asm/io_apic.h>
@@ -41,9 +40,9 @@
 #include <asm/gart.h>
 #include <asm/dma.h>
 #include <uapi/linux/iommufd.h>
+#include <linux/generic_pt/iommu.h>
 
 #include "amd_iommu.h"
-#include "../dma-iommu.h"
 #include "../irq_remapping.h"
 #include "../iommu-pages.h"
 
@@ -60,7 +59,6 @@
 LIST_HEAD(hpet_map);
 LIST_HEAD(acpihid_map);
 
 const struct iommu_ops amd_iommu_ops;
-static const struct iommu_dirty_ops amd_dirty_ops;
 
 int amd_iommu_max_glx_val = -1;
@@ -74,11 +72,18 @@
 static int amd_iommu_attach_device(struct iommu_domain *dom,
 				   struct device *dev);
 static void set_dte_entry(struct amd_iommu *iommu,
-			  struct iommu_dev_data *dev_data);
+			  struct iommu_dev_data *dev_data,
+			  phys_addr_t top_paddr, unsigned int top_level);
+static void amd_iommu_change_top(struct pt_iommu *iommu_table,
+				 phys_addr_t top_paddr, unsigned int top_level);
 static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid);
 
 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid);
 
+static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain);
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+					bool enable);
+
 /****************************************************************************
@@ -1756,42 +1761,6 @@ static void dev_flush_pasid_all(struct iommu_dev_data *dev_data,
 			     CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
 }
 
-/* Flush the not present cache if it exists */
-static void domain_flush_np_cache(struct protection_domain *domain,
-				  dma_addr_t iova, size_t size)
-{
-	if (unlikely(amd_iommu_np_cache)) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&domain->lock, flags);
-		amd_iommu_domain_flush_pages(domain, iova, size);
-		spin_unlock_irqrestore(&domain->lock, flags);
-	}
-}
-
-/*
- * This function flushes the DTEs for all devices in domain
- */
-void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data;
-
-	lockdep_assert_held(&domain->lock);
-
-	list_for_each_entry(dev_data, &domain->dev_list, list) {
-		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
-
-		set_dte_entry(iommu, dev_data);
-		clone_aliases(iommu, dev_data->dev);
-	}
-
-	list_for_each_entry(dev_data, &domain->dev_list, list)
-		device_flush_dte(dev_data);
-
-	domain_flush_complete(domain);
-}
-
 int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
 {
 	struct iommu_dev_data *dev_data;
@@ -2051,7 +2020,8 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu,
 }
 
 static void set_dte_entry(struct amd_iommu *iommu,
-			  struct iommu_dev_data *dev_data)
+			  struct iommu_dev_data *dev_data,
+			  phys_addr_t top_paddr, unsigned int top_level)
 {
 	u16 domid;
 	u32 old_domid;
@@ -2060,19 +2030,36 @@ static void set_dte_entry(struct amd_iommu *iommu,
 	struct protection_domain *domain = dev_data->domain;
 	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
 	struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
-
-	if (gcr3_info && gcr3_info->gcr3_tbl)
-		domid = dev_data->gcr3_info.domid;
-	else
-		domid = domain->id;
+	struct pt_iommu_amdv1_hw_info pt_info;
 
 	make_clear_dte(dev_data, dte, &new);
 
-	if (domain->iop.mode != PAGE_MODE_NONE)
-		new.data[0] |= iommu_virt_to_phys(domain->iop.root);
+	if (gcr3_info && gcr3_info->gcr3_tbl)
+		domid = dev_data->gcr3_info.domid;
+	else {
+		domid = domain->id;
 
-	new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
-		    << DEV_ENTRY_MODE_SHIFT;
+		if (domain->domain.type & __IOMMU_DOMAIN_PAGING) {
+			/*
+			 * When updating the IO pagetable, the new top and level
+			 * are provided as parameters. For other operations i.e.
+			 * device attach, retrieve the current pagetable info
+			 * via the IOMMU PT API.
+			 */
+			if (top_paddr) {
+				pt_info.host_pt_root = top_paddr;
+				pt_info.mode = top_level + 1;
+			} else {
+				WARN_ON(top_paddr || top_level);
+				pt_iommu_amdv1_hw_info(&domain->amdv1,
						       &pt_info);
+			}
+
+			new.data[0] |= __sme_set(pt_info.host_pt_root) |
+				       (pt_info.mode & DEV_ENTRY_MODE_MASK)
+					       << DEV_ENTRY_MODE_SHIFT;
+		}
+	}
 
 	new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW;
@@ -2138,7 +2125,7 @@ static void dev_update_dte(struct iommu_dev_data *dev_data, bool set)
 	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
 
 	if (set)
-		set_dte_entry(iommu, dev_data);
+		set_dte_entry(iommu, dev_data, 0, 0);
 	else
 		clear_dte_entry(iommu, dev_data);
@@ -2156,6 +2143,7 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
 {
 	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
 	int max_pasids = dev_data->max_pasids;
+	struct pt_iommu_x86_64_hw_info pt_info;
 	int ret = 0;
 
 	/*
@@ -2178,7 +2166,8 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
 	if (!pdom_is_v2_pgtbl_mode(pdom))
 		return ret;
 
-	ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true);
+	pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info);
+	ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true);
 	if (ret)
 		free_gcr3_table(&dev_data->gcr3_info);
@@ -2500,54 +2489,6 @@ struct protection_domain *protection_domain_alloc(void)
 	return domain;
 }
 
-static int pdom_setup_pgtable(struct protection_domain *domain,
-			      struct device *dev)
-{
-	struct io_pgtable_ops *pgtbl_ops;
-	enum io_pgtable_fmt fmt;
-
-	switch (domain->pd_mode) {
-	case PD_MODE_V1:
-		fmt = AMD_IOMMU_V1;
-		break;
-	case PD_MODE_V2:
-		fmt = AMD_IOMMU_V2;
-		break;
-	case PD_MODE_NONE:
-		WARN_ON_ONCE(1);
-		return -EPERM;
-	}
-
-	domain->iop.pgtbl.cfg.amd.nid = dev_to_node(dev);
-	pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain);
-	if (!pgtbl_ops)
-		return -ENOMEM;
-
-	return 0;
-}
-
-static inline u64 dma_max_address(enum protection_domain_mode pgtable)
-{
-	if (pgtable == PD_MODE_V1)
-		return PM_LEVEL_SIZE(amd_iommu_hpt_level);
-
-	/*
-	 * V2 with 4/5 level page table. Note that "2.2.6.5 AMD64 4-Kbyte Page
-	 * Translation" shows that the V2 table sign extends the top of the
-	 * address space creating a reserved region in the middle of the
-	 * translation, just like the CPU does. Further Vasant says the docs are
-	 * incomplete and this only applies to non-zero PASIDs. If the AMDv2
-	 * page table is assigned to the 0 PASID then there is no sign extension
-	 * check.
-	 *
-	 * Since the IOMMU must have a fixed geometry, and the core code does
-	 * not understand sign extended addressing, we have to chop off the high
-	 * bit to get consistent behavior with attachments of the domain to any
-	 * PASID.
-	 */
-	return ((1ULL << (PM_LEVEL_SHIFT(amd_iommu_gpt_level) - 1)) - 1);
-}
-
 static bool amd_iommu_hd_support(struct amd_iommu *iommu)
 {
 	if (amd_iommu_hatdis)
@@ -2556,38 +2497,229 @@ static bool amd_iommu_hd_support(struct amd_iommu *iommu)
 	return iommu && (iommu->features & FEATURE_HDSUP);
 }
 
-static struct iommu_domain *
-do_iommu_domain_alloc(struct device *dev, u32 flags,
-		      enum protection_domain_mode pgtable)
+static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt)
 {
-	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
-	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
+	struct protection_domain *pdom =
+		container_of(iommupt, struct protection_domain, iommu);
+
+	return &pdom->lock;
+}
+
+/*
+ * Update all HW references to the domain with a new pgtable configuration.
+ */
+static void amd_iommu_change_top(struct pt_iommu *iommu_table,
+				 phys_addr_t top_paddr, unsigned int top_level)
+{
+	struct protection_domain *pdom =
+		container_of(iommu_table, struct protection_domain, iommu);
+	struct iommu_dev_data *dev_data;
+
+	lockdep_assert_held(&pdom->lock);
+
+	/* Update the DTE for all devices attached to this domain */
+	list_for_each_entry(dev_data, &pdom->dev_list, list) {
+		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
+
+		/* Update the HW references with the new level and top ptr */
+		set_dte_entry(iommu, dev_data, top_paddr, top_level);
+		clone_aliases(iommu, dev_data->dev);
+	}
+
+	list_for_each_entry(dev_data, &pdom->dev_list, list)
+		device_flush_dte(dev_data);
+
+	domain_flush_complete(pdom);
+}
+
+/*
+ * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to
+ * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non
+ * present caching (like hypervisor shadowing).
+ */
+static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
+				    unsigned long iova, size_t size)
+{
+	struct protection_domain *domain = to_pdomain(dom);
+	unsigned long flags;
+
+	if (likely(!amd_iommu_np_cache))
+		return 0;
+
+	spin_lock_irqsave(&domain->lock, flags);
+	amd_iommu_domain_flush_pages(domain, iova, size);
+	spin_unlock_irqrestore(&domain->lock, flags);
+	return 0;
+}
+
+static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
+{
+	struct protection_domain *dom = to_pdomain(domain);
+	unsigned long flags;
+
+	spin_lock_irqsave(&dom->lock, flags);
+	amd_iommu_domain_flush_all(dom);
+	spin_unlock_irqrestore(&dom->lock, flags);
+}
+
+static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
+				 struct iommu_iotlb_gather *gather)
+{
+	struct protection_domain *dom = to_pdomain(domain);
+	unsigned long flags;
+
+	spin_lock_irqsave(&dom->lock, flags);
+	amd_iommu_domain_flush_pages(dom, gather->start,
+				     gather->end - gather->start + 1);
+	spin_unlock_irqrestore(&dom->lock, flags);
+	iommu_put_pages_list(&gather->freelist);
+}
+
+static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = {
+	.get_top_lock = amd_iommu_get_top_lock,
+	.change_top = amd_iommu_change_top,
+};
+
+static const struct iommu_domain_ops amdv1_ops = {
+	IOMMU_PT_DOMAIN_OPS(amdv1),
+	.iotlb_sync_map = amd_iommu_iotlb_sync_map,
+	.flush_iotlb_all = amd_iommu_flush_iotlb_all,
+	.iotlb_sync = amd_iommu_iotlb_sync,
+	.attach_dev = amd_iommu_attach_device,
+	.free = amd_iommu_domain_free,
+	.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
+};
+
+static const struct iommu_dirty_ops amdv1_dirty_ops = {
+	IOMMU_PT_DIRTY_OPS(amdv1),
+	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
+};
+
+static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev,
+							      u32 flags)
+{
+	struct pt_iommu_amdv1_cfg cfg = {};
 	struct protection_domain *domain;
 	int ret;
 
+	if (amd_iommu_hatdis)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	domain = protection_domain_alloc();
 	if (!domain)
 		return ERR_PTR(-ENOMEM);
 
-	domain->pd_mode = pgtable;
-	ret = pdom_setup_pgtable(domain, dev);
+	domain->pd_mode = PD_MODE_V1;
+	domain->iommu.driver_ops = &amd_hw_driver_ops_v1;
+	domain->iommu.nid = dev_to_node(dev);
+
+	if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+		domain->domain.dirty_ops = &amdv1_dirty_ops;
+
+	/*
+	 * Someday FORCE_COHERENCE should be set by
+	 * amd_iommu_enforce_cache_coherency() like VT-d does.
+	 */
+	cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) |
+			      BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
+			      BIT(PT_FEAT_AMDV1_FORCE_COHERENCE);
+
+	/*
+	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
+	 * Unless we run in a virtual machine, which can be inferred according
+	 * to whether "non-present cache" is on, it is probably best to prefer
+	 * (potentially) too extensive TLB flushing (i.e., more misses) over
+	 * multiple TLB flushes (i.e., more flushes). For virtual machines the
+	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
+	 * the guest, and the trade-off is different: unnecessary TLB flushes
+	 * should be avoided.
+	 */
+	if (amd_iommu_np_cache)
+		cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
+	else
+		cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
+
+	cfg.common.hw_max_vasz_lg2 =
+		min(64, (amd_iommu_hpt_level - 1) * 9 + 21);
+	cfg.common.hw_max_oasz_lg2 = 52;
+	cfg.starting_level = 2;
+
+	domain->domain.ops = &amdv1_ops;
+
+	ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL);
 	if (ret) {
-		pdom_id_free(domain->id);
-		kfree(domain);
+		amd_iommu_domain_free(&domain->domain);
 		return ERR_PTR(ret);
 	}
 
-	domain->domain.geometry.aperture_start = 0;
-	domain->domain.geometry.aperture_end   = dma_max_address(pgtable);
-	domain->domain.geometry.force_aperture = true;
-	domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap;
+	/*
+	 * Narrow the supported page sizes to those selected by the kernel
+	 * command line.
+	 */
+	domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap;
+	return &domain->domain;
+}
 
-	domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
-	domain->domain.ops = iommu->iommu.ops->default_domain_ops;
+static const struct iommu_domain_ops amdv2_ops = {
+	IOMMU_PT_DOMAIN_OPS(x86_64),
+	.iotlb_sync_map = amd_iommu_iotlb_sync_map,
+	.flush_iotlb_all = amd_iommu_flush_iotlb_all,
+	.iotlb_sync = amd_iommu_iotlb_sync,
+	.attach_dev = amd_iommu_attach_device,
+	.free = amd_iommu_domain_free,
+	/*
+	 * Note the AMDv2 page table format does not support a Force Coherency
+	 * bit, so enforce_cache_coherency should not be set. However VFIO is
+	 * not prepared to handle a case where some domains will support
+	 * enforcement and others do not. VFIO and iommufd will have to be fixed
+	 * before it can fully use the V2 page table. See the comment in
+	 * iommufd_hwpt_paging_alloc(). For now leave things as they have
+	 * historically been and lie about enforce_cache_coherencey.
+	 */
+	.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
+};
 
-	if (dirty_tracking)
-		domain->domain.dirty_ops = &amd_dirty_ops;
+static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev,
+							      u32 flags)
+{
+	struct pt_iommu_x86_64_cfg cfg = {};
+	struct protection_domain *domain;
+	int ret;
+
+	if (!amd_iommu_v2_pgtbl_supported())
+		return ERR_PTR(-EOPNOTSUPP);
+
+	domain = protection_domain_alloc();
+	if (!domain)
+		return ERR_PTR(-ENOMEM);
+
+	domain->pd_mode = PD_MODE_V2;
+	domain->iommu.nid = dev_to_node(dev);
+
+	cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES);
+	if (amd_iommu_np_cache)
+		cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
+	else
+		cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
+
+	/*
+	 * The v2 table behaves differently if it is attached to PASID 0 vs a
+	 * non-zero PASID. On PASID 0 it has no sign extension and the full
+	 * 57/48 bits decode the lower addresses. Otherwise it behaves like a
+	 * normal sign extended x86 page table. Since we want the domain to work
+	 * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not
+	 * set which creates a table that is compatible in both modes.
+	 */
+	if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
+		cfg.common.hw_max_vasz_lg2 = 56;
+	else
+		cfg.common.hw_max_vasz_lg2 = 47;
+	cfg.common.hw_max_oasz_lg2 = 52;
+
+	domain->domain.ops = &amdv2_ops;
+
+	ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL);
+	if (ret) {
+		amd_iommu_domain_free(&domain->domain);
+		return ERR_PTR(ret);
+	}
 	return &domain->domain;
 }
@@ -2608,15 +2740,27 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
 		/* Allocate domain with v1 page table for dirty tracking */
 		if (!amd_iommu_hd_support(iommu))
 			break;
-		return do_iommu_domain_alloc(dev, flags, PD_MODE_V1);
+		return amd_iommu_domain_alloc_paging_v1(dev, flags);
 	case IOMMU_HWPT_ALLOC_PASID:
 		/* Allocate domain with v2 page table if IOMMU supports PASID. */
 		if (!amd_iommu_pasid_supported())
 			break;
-		return do_iommu_domain_alloc(dev, flags, PD_MODE_V2);
-	case 0:
+		return amd_iommu_domain_alloc_paging_v2(dev, flags);
+	case 0: {
+		struct iommu_domain *ret;
+
 		/* If nothing specific is required use the kernel commandline default */
-		return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable);
+		if (amd_iommu_pgtable == PD_MODE_V1) {
+			ret = amd_iommu_domain_alloc_paging_v1(dev, flags);
+			if (ret != ERR_PTR(-EOPNOTSUPP))
+				return ret;
+			return amd_iommu_domain_alloc_paging_v2(dev, flags);
+		}
+		ret = amd_iommu_domain_alloc_paging_v2(dev, flags);
+		if (ret != ERR_PTR(-EOPNOTSUPP))
+			return ret;
+		return amd_iommu_domain_alloc_paging_v1(dev, flags);
+	}
 	default:
 		break;
 	}
@@ -2628,8 +2772,7 @@ void amd_iommu_domain_free(struct iommu_domain *dom)
 	struct protection_domain *domain = to_pdomain(dom);
 
 	WARN_ON(!list_empty(&domain->dev_list));
 
-	if (domain->domain.type & __IOMMU_DOMAIN_PAGING)
-		free_io_pgtable_ops(&domain->iop.pgtbl.ops);
+	pt_iommu_deinit(&domain->iommu);
 	pdom_id_free(domain->id);
 	kfree(domain);
 }
@@ -2734,93 +2877,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 	return ret;
 }
 
-static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
-				    unsigned long iova, size_t size)
-{
-	struct protection_domain *domain = to_pdomain(dom);
-	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
-
-	if (ops->map_pages)
-		domain_flush_np_cache(domain, iova, size);
-	return 0;
-}
-
-static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
-			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
-			       int iommu_prot, gfp_t gfp, size_t *mapped)
-{
-	struct protection_domain *domain = to_pdomain(dom);
-	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
-	int prot = 0;
-	int ret = -EINVAL;
-
-	if ((domain->pd_mode == PD_MODE_V1) &&
-	    (domain->iop.mode == PAGE_MODE_NONE))
-		return -EINVAL;
-
-	if (iommu_prot & IOMMU_READ)
-		prot |= IOMMU_PROT_IR;
-	if (iommu_prot & IOMMU_WRITE)
-		prot |= IOMMU_PROT_IW;
-
-	if (ops->map_pages) {
-		ret = ops->map_pages(ops, iova, paddr, pgsize,
-				     pgcount, prot, gfp, mapped);
-	}
-
-	return ret;
-}
-
-static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
-					    struct iommu_iotlb_gather *gather,
-					    unsigned long iova, size_t size)
-{
-	/*
-	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
-	 * Unless we run in a virtual machine, which can be inferred according
-	 * to whether "non-present cache" is on, it is probably best to prefer
-	 * (potentially) too extensive TLB flushing (i.e., more misses) over
-	 * mutliple TLB flushes (i.e., more flushes). For virtual machines the
-	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
-	 * the guest, and the trade-off is different: unnecessary TLB flushes
-	 * should be avoided.
-	 */
-	if (amd_iommu_np_cache &&
-	    iommu_iotlb_gather_is_disjoint(gather, iova, size))
-		iommu_iotlb_sync(domain, gather);
-
-	iommu_iotlb_gather_add_range(gather, iova, size);
-}
-
-static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
-				    size_t pgsize, size_t pgcount,
-				    struct iommu_iotlb_gather *gather)
-{
-	struct protection_domain *domain = to_pdomain(dom);
-	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
-	size_t r;
-
-	if ((domain->pd_mode == PD_MODE_V1) &&
-	    (domain->iop.mode == PAGE_MODE_NONE))
-		return 0;
-
-	r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
-
-	if (r)
-		amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
-
-	return r;
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
-					  dma_addr_t iova)
-{
-	struct protection_domain *domain = to_pdomain(dom);
-	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
-
-	return ops->iova_to_phys(ops, iova);
-}
-
 static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
 {
 	switch (cap) {
@@ -2887,28 +2943,6 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
 	return 0;
 }
 
-static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
-					  unsigned long iova, size_t size,
-					  unsigned long flags,
-					  struct iommu_dirty_bitmap *dirty)
-{
-	struct protection_domain *pdomain = to_pdomain(domain);
-	struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops;
-	unsigned long lflags;
-
-	if (!ops || !ops->read_and_clear_dirty)
-		return -EOPNOTSUPP;
-
-	spin_lock_irqsave(&pdomain->lock, lflags);
-	if (!pdomain->dirty_tracking && dirty->bitmap) {
-		spin_unlock_irqrestore(&pdomain->lock, lflags);
-		return -EINVAL;
-	}
-	spin_unlock_irqrestore(&pdomain->lock, lflags);
-
-	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
-}
-
 static void amd_iommu_get_resv_regions(struct device *dev,
 				       struct list_head *head)
 {
@@ -2978,28 +3012,6 @@ static bool amd_iommu_is_attach_deferred(struct device *dev)
 	return dev_data->defer_attach;
 }
 
-static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
-{
-	struct protection_domain *dom = to_pdomain(domain);
-	unsigned long flags;
-
-	spin_lock_irqsave(&dom->lock, flags);
-	amd_iommu_domain_flush_all(dom);
-	spin_unlock_irqrestore(&dom->lock, flags);
-}
-
-static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
-				 struct iommu_iotlb_gather *gather)
-{
-	struct protection_domain *dom = to_pdomain(domain);
-	unsigned long flags;
-
-	spin_lock_irqsave(&dom->lock, flags);
-	amd_iommu_domain_flush_pages(dom, gather->start,
-				     gather->end - gather->start + 1);
-	spin_unlock_irqrestore(&dom->lock, flags);
-}
-
 static int amd_iommu_def_domain_type(struct device *dev)
 {
 	struct iommu_dev_data *dev_data;
@@ -3034,11 +3046,6 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
 	return true;
 }
 
-static const struct iommu_dirty_ops amd_dirty_ops = {
-	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
-	.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
-};
-
 const struct iommu_ops amd_iommu_ops = {
 	.capable = amd_iommu_capable,
 	.blocked_domain = &blocked_domain,
@@ -3053,17 +3060,6 @@ const struct iommu_ops amd_iommu_ops = {
 	.is_attach_deferred = amd_iommu_is_attach_deferred,
 	.def_domain_type = amd_iommu_def_domain_type,
 	.page_response = amd_iommu_page_response,
-	.default_domain_ops = &(const struct iommu_domain_ops) {
-		.attach_dev = amd_iommu_attach_device,
-		.map_pages = amd_iommu_map_pages,
-		.unmap_pages = amd_iommu_unmap_pages,
-		.iotlb_sync_map = amd_iommu_iotlb_sync_map,
-		.iova_to_phys = amd_iommu_iova_to_phys,
-		.flush_iotlb_all = amd_iommu_flush_iotlb_all,
-		.iotlb_sync = amd_iommu_iotlb_sync,
-		.free = amd_iommu_domain_free,
-		.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
-	}
 };
 
 #ifdef CONFIG_IRQ_REMAP
@@ -4072,3 +4068,5 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
 	return 0;
 }
 #endif
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
2.43.0