On 10/23/2025 11:50 PM, Jason Gunthorpe wrote:
AMD IOMMU v1 is unique in supporting contiguous pages with a variable size and it can decode the full 64 bit VA space. Unlike other x86 page tables this explicitly does not do sign extension as part of allowing the entire 64 bit VA space to be supported.
The general design is quite similar to the x86 PAE format, except with a 6th level and quite different PTE encoding.
This format is the only one that uses the PT_FEAT_DYNAMIC_TOP feature in the existing code as the existing AMDv1 code starts out with a 3 level table and adds levels on the fly if more IOVA is needed.
Comparing the performance of several operations to the existing version:
iommu_map()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     65,64    ,      62,61      ,  -1.01
     2^13,     70,66    ,      67,62      ,  -8.08
     2^14,     73,69    ,      71,65      ,  -9.09
     2^15,     78,75    ,      75,71      ,  -5.05
     2^16,     89,89    ,      86,84      ,  -2.02
     2^17,    128,121   ,     124,112     , -10.10
     2^18,    175,175   ,     170,163     ,  -4.04
     2^19,    264,306   ,     261,279     ,   6.06
     2^20,    444,525   ,     438,489     ,  10.10
     2^21,     60,62    ,      58,59      ,   1.01
 256*2^12,    381,1833  ,     367,1795    ,  79.79
 256*2^21,    375,1623  ,     356,1555    ,  77.77
 256*2^30,    356,1338  ,     349,1277    ,  72.72
iommu_unmap()
   pgsz  ,avg new,old ns, min new,old ns  , min % (+ve is better)
     2^12,     76,89    ,      71,86      ,  17.17
     2^13,     79,89    ,      75,86      ,  12.12
     2^14,     78,90    ,      74,86      ,  13.13
     2^15,     82,89    ,      74,86      ,  13.13
     2^16,     79,89    ,      74,86      ,  13.13
     2^17,     81,89    ,      77,87      ,  11.11
     2^18,     90,92    ,      87,89      ,   2.02
     2^19,     91,93    ,      88,90      ,   2.02
     2^20,     96,95    ,      91,92      ,   1.01
     2^21,     72,88    ,      68,85      ,  20.20
 256*2^12,    372,6583  ,     364,6251    ,  94.94
 256*2^21,    398,6032  ,     392,5758    ,  93.93
 256*2^30,    396,5665  ,     389,5258    ,  92.92
The ~5-17x speedup when working with multi-PTE map/unmaps is because the AMD implementation rewalks the entire table on every new PTE while this version retains its position. The same speedup will be seen with dirtys as well.
The old implementation triggers a compiler optimization that ends up generating a "rep stos" memset for contiguous PTEs. Since AMD can have contiguous PTEs that span 2Kbytes of table this is a huge win compared to a normal movq loop. It is why the unmap side has a fairly flat runtime as the contiguous PTE size increases. This version makes it explicit with a memset64() call.
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
drivers/iommu/Makefile | 1 + drivers/iommu/generic_pt/Kconfig | 12 + drivers/iommu/generic_pt/fmt/Makefile | 11 + drivers/iommu/generic_pt/fmt/amdv1.h | 391 +++++++++++++++++++++ drivers/iommu/generic_pt/fmt/defs_amdv1.h | 21 ++ drivers/iommu/generic_pt/fmt/iommu_amdv1.c | 15 + include/linux/generic_pt/common.h | 19 + include/linux/generic_pt/iommu.h | 12 + 8 files changed, 482 insertions(+) create mode 100644 drivers/iommu/generic_pt/fmt/Makefile create mode 100644 drivers/iommu/generic_pt/fmt/amdv1.h create mode 100644 drivers/iommu/generic_pt/fmt/defs_amdv1.h create mode 100644 drivers/iommu/generic_pt/fmt/iommu_amdv1.c
.../...
+$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m))) diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h new file mode 100644 index 00000000000000..1f46e4ab4aea51 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/amdv1.h @@ -0,0 +1,391 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/*
- Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
- AMD IOMMU v1 page table
- This is described in Section "2.2.3 I/O Page Tables for Host Translations"
- of the "AMD I/O Virtualization Technology (IOMMU) Specification"
- Note the level numbering here matches the core code, so level 0 is the same
- as mode 1.
- */
+#ifndef __GENERIC_PT_FMT_AMDV1_H +#define __GENERIC_PT_FMT_AMDV1_H
+#include "defs_amdv1.h" +#include "../pt_defs.h"
+#include <asm/page.h> +#include <linux/bitfield.h> +#include <linux/container_of.h> +#include <linux/mem_encrypt.h> +#include <linux/minmax.h> +#include <linux/sizes.h> +#include <linux/string.h>
+enum {
- PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
- PT_MAX_VA_ADDRESS_LG2 = 64,
- PT_ITEM_WORD_SIZE = sizeof(u64),
- PT_MAX_TOP_LEVEL = 5,
- PT_GRANULE_LG2SZ = 12,
- PT_TABLEMEM_LG2SZ = 12,
- /* The DTE only has these bits for the top physical address */
- PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+/* PTE bits */ +enum {
- AMDV1PT_FMT_PR = BIT(0),
- AMDV1PT_FMT_D = BIT(6),
- AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
- AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
- AMDV1PT_FMT_FC = BIT_ULL(60),
- AMDV1PT_FMT_IR = BIT_ULL(61),
- AMDV1PT_FMT_IW = BIT_ULL(62),
+};
+/*
- gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make
- these defines to avoid it.
- */
+#define AMDV1PT_FMT_NL_DEFAULT 0 +#define AMDV1PT_FMT_NL_SIZE 7
+#define common_to_amdv1pt(common_ptr) \
- container_of_const(common_ptr, struct pt_amdv1, common)
+#define to_amdv1pt(pts) common_to_amdv1pt((pts)->range->common)
Unused macros?
-Vasant