On Wed, Sep 03, 2025 at 02:46:28PM -0300, Jason Gunthorpe wrote:
+/**
- pt_entry_oa_lg2sz() - Return the size of a OA entry
an OA
- @pts: Entry to query
- If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise
- it returns the total VA/OA size of the entire contiguous entry.
- */
+static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts) +{
- return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts);
+}
------
- level
The number of table hops from the lowest leaf. Level 0
is always a table of only leaves of the least significant VA bits. The
Hmm, I am a bit confused here. I thought "leaf" was meant to be a "leaf" table? But here "a table of only leaves" makes it feel like a "leaf" table entry?
Also, isn't "the least significant VA bits" the page offset?
- table
A linear array of entries representing the translation items for that
level.
- index
The position in a table of an element: item = table[index]
- item
A single position in a table
- entry
A single logical element in a table. If contiguous pages are not
supported then item and entry are the same thing, otherwise entry refers
to the all the items that comprise a single contiguous translation.
So, an "entry" is a group of "items" if contiguous pages (huge page?) are supported. Then, the "entry" sounds like a physical (v.s. "logical") table entry, e.g. a PTE that we usually say?
+#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64) +static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry) +{
- u64 *entryp = pt_cur_table(pts, u64) + pts->index;
- u64 old_entry = pts->entry;
- bool ret;
- /*
* Ensure the zero'd table content itself is visible before its PTE can
* be. release is a NOP on !SMP, but the HW is still doing an acquire.
*/
- if (!IS_ENABLED(CONFIG_SMP))
dma_wmb();
Mind elaborating why SMP doesn't need this?
+/*
- PT_WARN_ON is used for invariants that the kunit should be checking can't
- happen.
- */
+#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) +#define PT_WARN_ON WARN_ON +#else +static inline bool PT_WARN_ON(bool condition) +{
- return false;
Should it "return condition"?
Otherwise, these validations wouldn't be effective?
drivers/iommu/generic_pt/pt_iter.h:388: if (PT_WARN_ON(!pts->table_lower)) drivers/iommu/generic_pt/pt_iter.h-389- return -EINVAL; -- drivers/iommu/generic_pt/pt_iter.h-429- drivers/iommu/generic_pt/pt_iter.h:430: if (PT_WARN_ON(!pt_can_have_table(pts)) || drivers/iommu/generic_pt/pt_iter.h:431: PT_WARN_ON(!pts->table_lower)) drivers/iommu/generic_pt/pt_iter.h-432- return -EINVAL;
+/**
- pt_load_entry() - Read from the location pts points at into the pts
- @pts: Table index to load
- Set the type of entry that was loaded. pts->entry and pts->table_lower
- will be filled in with the entry's content.
- */
+static inline void pt_load_entry(struct pt_state *pts) +{
- pts->type = pt_load_entry_raw(pts);
- if (pts->type == PT_ENTRY_TABLE)
pts->table_lower = pt_table_ptr(pts);
+}
I see a couple of callers check pts->type. Maybe it could return pts->type, matching with pt_load_entry_raw()?
diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h new file mode 100644 index 00000000000000..19e8f820c1dccf --- /dev/null +++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/*
- Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
- Default definitions for formats that don't define these functions.
- */
+#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H +#define __GENERIC_PT_PT_FMT_DEFAULTS_H
+#include "pt_defs.h" +#include <linux/log2.h>
<> precedes ""
+#ifndef pt_pgsz_lg2_to_level +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
unsigned int pgsize_lg2)
+{
- return (pgsize_lg2 - PT_GRANULE_LG2SZ) /
(PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE));
- return 0;
"return 0" should likely be dropped.
+/*
- Format supplies either:
- pt_entry_oa - OA is at the start of a contiguous entry
- or
- pt_item_oa - OA is correct for every item in a contiguous entry
What does the "correct" mean here?
+/**
- pt_range_to_end_index() - Ending index iteration
- @pts: Iteration State
- Return: the last index for the iteration in pts.
- */
+static inline unsigned int pt_range_to_end_index(const struct pt_state *pts) +{
- unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
- struct pt_range *range = pts->range;
- unsigned int num_entries_lg2;
- if (range->va == range->last_va)
return pts->index + 1;
- if (pts->range->top_level == pts->level)
return log2_div(fvalog2_mod(pts->range->last_va,
pts->range->max_vasz_lg2),
isz_lg2) +
1;
How about: return 1 + log2_div(...); ?
+static __always_inline struct pt_range _pt_top_range(struct pt_common *common,
uintptr_t top_of_table)
+{
- struct pt_range range = {
.common = common,
.top_table =
(struct pt_table_p *)(top_of_table &
~(uintptr_t)PT_TOP_LEVEL_MASK),
.top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS),
Since top_level is unsigned, would it be faster to do bitwise: .top_level = top_of_table & PT_TOP_LEVEL_MASK, ?
+/*
/**
- pt_walk_descend_all() - Recursively invoke the walker for a table item
- @parent_pts: Iteration State
- @fn: Walker function to call
- @arg: Value to pass to the function
- With pts pointing at a table item this will descend and over the entire lower
- table. This creates a new walk and does not alter pts or pts->range.
- */
+static __always_inline int +pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn,
void *arg)
-------
+/**
- pt_compute_best_pgsize() - Determine the best page size for leaf entries
- @pgsz_bitmap: Permitted page sizes
- @va: Starting virtual address for the leaf entry
- @last_va: Last virtual address for the leaf entry, sets the max page size
- @oa: Starting output address for the leaf entry
- Compute the largest page size for va, last_va, and oa together and return it
- in lg2. The largest page size depends on the format's supported page sizes at
- this level, and the relative alignment of the VA and OA addresses. 0 means
- the OA cannot be stored with the provided pgsz_bitmap.
- */
+static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap,
pt_vaddr_t va,
pt_vaddr_t last_va,
pt_oaddr_t oa)
+{
- unsigned int best_pgsz_lg2;
- unsigned int pgsz_lg2;
- pt_vaddr_t len = last_va - va + 1;
- pt_vaddr_t mask;
- if (PT_WARN_ON(va >= last_va))
return 0;
- /*
* Given a VA/OA pair the best page size is the largest page side
* where:
*
* 1) VA and OA start at the page. Bitwise this is the count of least
* significant 0 bits.
* This also implies that last_va/oa has the same prefix as va/oa.
*/
- mask = va | oa;
- /*
* 2) The page size is not larger than the last_va (length). Since page
* sizes are always power of two this can't be larger than the
* largest power of two factor of the length.
*/
- mask |= log2_to_int(log2_fls(len) - 1);
- best_pgsz_lg2 = log2_ffs(mask);
- /* Choose the higest bit <= best_pgsz_lg2 */
highest
+/* Compute a */ +#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2))) +static_assert(log2_to_int_t(unsigned int, 0) == 1);
+/* Compute a - 1 (aka all low bits set) */ +#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1))
+/* Compute a / b */ +#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2))) +static_assert(log2_div_t(unsigned int, 4, 2) == 1);
I skimmed through these macros and its callers. They are mostly dealing with VA, OA, and mask, which feels like straightforward bit-masking/shifting operations.
E.g. log2_to_int_t = 64bit ? BIT_ULL(lg2) : BIT(lg2); log2_to_max_int_t = 64bit ? GENMASK_ULL(lg2 - 1, 0) : GENMASK(lg2 - 1, 0);
What's the benefit from making them as arithmetic ones?
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h new file mode 100644 index 00000000000000..a29bdd7b244de6 --- /dev/null +++ b/include/linux/generic_pt/common.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/*
- Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
- */
+#ifndef __GENERIC_PT_COMMON_H +#define __GENERIC_PT_COMMON_H
+#include <linux/types.h> +#include <linux/build_bug.h> +#include <linux/bits.h>
In alphabetical order.
Thanks Nicolin