 
            unmap_pages removes mappings and any fully contained interior tables from the given range. This follows the now-standard iommu_domain API definition, under which it does not split larger page sizes into smaller ones. The caller must unmap only ranges created by map, or it must have otherwise determined safe cut points (e.g. iommufd/vfio use iova_to_phys to scan for them).
Future work will provide a 'cut' operation that explicitly performs the page size split if the HW can support it.
unmap is implemented with a recursive descent of the tree. If the caller provides a VA range that spans an entire table item then the table memory can be freed as well.
If an entire table item can be freed then this version will also check the leaf-only level of the tree to ensure that all entries are present, generating -EINVAL if any entry is missing. Many of the existing drivers don't do this extra check.
This version sits under the iommu_domain_ops as unmap_pages() but does not require the external page size calculation. The implementation is actually unmap_range() and can do arbitrary ranges, internally handling all the validation and supporting any arrangement of page sizes. A future series can optimize __iommu_unmap() to take advantage of this.
Freed page table memory is batched up in the gather and will be freed in the driver's iotlb_sync() callback after the IOTLB flush completes.
Tested-by: Alejandro Jimenez alejandro.j.jimenez@oracle.com Reviewed-by: Kevin Tian kevin.tian@intel.com Signed-off-by: Jason Gunthorpe jgg@nvidia.com --- drivers/iommu/generic_pt/iommu_pt.h | 156 ++++++++++++++++++++++++++++ include/linux/generic_pt/iommu.h | 10 +- 2 files changed, 164 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 5ff1b887928a46..e3d1b272723db0 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -14,6 +14,29 @@ #include <linux/export.h> #include <linux/iommu.h> #include "../iommu-pages.h" +#include <linux/cleanup.h> +#include <linux/dma-mapping.h> + +static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather, + struct pt_iommu *iommu_table, pt_vaddr_t iova, + pt_vaddr_t len, + struct iommu_pages_list *free_list) +{ + struct pt_common *common = common_from_iommu(iommu_table); + + if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) && + iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) { + iommu_iotlb_sync(&iommu_table->domain, iotlb_gather); + /* + * Note that the sync frees the gather's free list, so we must + * not have any pages on that list that are covered by iova/len + */ + } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) { + iommu_iotlb_gather_add_range(iotlb_gather, iova, len); + } + + iommu_pages_list_splice(free_list, &iotlb_gather->freelist); +}
#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
@@ -164,6 +187,139 @@ static inline struct pt_table_p *table_alloc_top(struct pt_common *common, log2_to_int(pt_top_memsize_lg2(common, top_of_table))); }
+struct pt_unmap_args { + struct iommu_pages_list free_list; + pt_vaddr_t unmapped; +}; + +static __maybe_unused int __unmap_range(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_unmap_args *unmap = arg; + unsigned int num_oas = 0; + unsigned int start_index; + int ret = 0; + + _pt_iter_first(&pts); + start_index = pts.index; + pts.type = pt_load_entry_raw(&pts); + /* + * A starting index is in the middle of a contiguous entry + * + * The IOMMU API does not require drivers to support unmapping parts of + * large pages. Long ago VFIO would try to split maps but the current + * version never does. + * + * Instead when unmap reaches a partial unmap of the start of a large + * IOPTE it should remove the entire IOPTE and return that size to the + * caller. + */ + if (pts.type == PT_ENTRY_OA) { + if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts))) + return -EINVAL; + /* Micro optimization */ + goto start_oa; + } + + do { + if (pts.type != PT_ENTRY_OA) { + bool fully_covered; + + if (pts.type != PT_ENTRY_TABLE) { + ret = -EINVAL; + break; + } + + if (pts.index != start_index) + pt_index_to_va(&pts); + pts.table_lower = pt_table_ptr(&pts); + + fully_covered = pt_entry_fully_covered( + &pts, pt_table_item_lg2sz(&pts)); + + ret = pt_descend(&pts, arg, __unmap_range); + if (ret) + break; + + /* + * If the unmapping range fully covers the table then we + * can free it as well. The clear is delayed until we + * succeed in clearing the lower table levels. + */ + if (fully_covered) { + iommu_pages_list_add(&unmap->free_list, + pts.table_lower); + pt_clear_entries(&pts, ilog2(1)); + } + pts.index++; + } else { + unsigned int num_contig_lg2; +start_oa: + /* + * If the caller requested an last that falls within a + * single entry then the entire entry is unmapped and + * the length returned will be larger than requested. 
+ */ + num_contig_lg2 = pt_entry_num_contig_lg2(&pts); + pt_clear_entries(&pts, num_contig_lg2); + num_oas += log2_to_int(num_contig_lg2); + pts.index += log2_to_int(num_contig_lg2); + } + if (pts.index >= pts.end_index) + break; + pts.type = pt_load_entry_raw(&pts); + } while (true); + + unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts)); + return ret; +} + +/** + * unmap_pages() - Make a range of IOVA empty/not present + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @pgsize: Length of each page + * @pgcount: Length of the range in pgsize units starting from @iova + * @iotlb_gather: Gather struct that must be flushed on return + * + * unmap_pages() will remove a translation created by map_pages(). It cannot + * subdivide a mapping created by map_pages(), so it should be called with IOVA + * ranges that match those passed to map_pages(). The IOVA range can aggregate + * contiguous map_pages() calls so long as no individual range is split. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: Number of bytes of VA unmapped. iova + res will be the point + * unmapping stopped. 
+ */ +size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT( + unmap.free_list) }; + pt_vaddr_t len = pgsize * pgcount; + struct pt_range range; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, len); + if (ret) + return 0; + + pt_walk_range(&range, __unmap_range, &unmap); + + gather_range_pages(iotlb_gather, iommu_table, iova, len, + &unmap.free_list); + + return unmap.unmapped; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU"); + static void NS(get_info)(struct pt_iommu *iommu_table, struct pt_iommu_info *info) { diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 5622856e199881..ceb6bc9cea37cd 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -9,6 +9,7 @@ #include <linux/iommu.h> #include <linux/mm_types.h>
+struct iommu_iotlb_gather; struct pt_iommu_ops;
/** @@ -119,6 +120,10 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ + size_t pt_iommu_##fmt##_unmap_pages( \ + struct iommu_domain *domain, unsigned long iova, \ + size_t pgsize, size_t pgcount, \ + struct iommu_iotlb_gather *iotlb_gather); \ int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ const struct pt_iommu_##fmt##_cfg *cfg, \ gfp_t gfp); \ @@ -135,8 +140,9 @@ struct pt_iommu_cfg { * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the * iommu_pt */ -#define IOMMU_PT_DOMAIN_OPS(fmt) \ - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, +#define IOMMU_PT_DOMAIN_OPS(fmt) \ + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ + .unmap_pages = &pt_iommu_##fmt##_unmap_pages
/* * The driver should setup its domain struct like