Hi all,
This series implements the Permission Overlay Extension introduced in the 2022 VMSA enhancements [1]. It is based on v6.11-rc4.
Changes since v4 [2]:
  - Added Acks and R-bs, thanks!
  - KVM:
    - Move POR_EL{0,1} handling inside TCR_EL2 blocks
    - Add visibility functions for registers [4]
    - Make ID_AA64MMFR3_EL1 writable
  - Use system_supports_poe() more consistently
  - Use BIT() instead of hex constants
  - Fix off-by-one in arch_max_pkey() macro
  - Add PKEY_DISABLE_EXECUTE and PKEY_DISABLE_READ
  - Update some comments and commit messages
  - No change to when we save/restore POR_EL0 for signals!
Conflicts with GCS:
  - Uses the same (last) bit in HWCAP2
  - Uses the same VM_HIGH_ARCH_5
Conflicts with arm64 KVM:
  - Maz has taken patch 8 into one of his own series
  - I have taken and modified a patch from Maz (patch 9)
The Permission Overlay Extension makes it possible to constrain the permissions of memory regions, and to do so from userspace (EL0) without a system call or TLB invalidation.
POE is used to implement the Linux Memory Protection Keys [3] syscall interface.
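For reference, a minimal userspace sketch of that interface (illustrative, not from this series; it uses the glibc pkey_* wrappers, which may not yet be available for arm64, and elides error handling):

#define _GNU_SOURCE
#include <sys/mman.h>

int main(void)
{
	char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* Allocate a key and attach it to the mapping. */
	int pkey = pkey_alloc(0, 0);
	pkey_mprotect(buf, 4096, PROT_READ | PROT_WRITE, pkey);

	/* Deny access through the overlay: a register write (POR_EL0 on
	   arm64), with no syscall and no TLB invalidation. */
	pkey_set(pkey, PKEY_DISABLE_ACCESS);
	/* buf[0] = 1; here would raise SIGSEGV with si_code SEGV_PKUERR */

	/* Restore access and use the memory normally. */
	pkey_set(pkey, 0);
	buf[0] = 1;

	pkey_free(pkey);
	return 0;
}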
The first few patches add the basic framework, then the PKEYS interface is implemented, and then the selftests are made to work on arm64.
I have tested the modified protection_keys test on x86_64, but not on PPC. I haven't build-tested the x86/ppc arch changes.
Thanks, Joey
[1] https://community.arm.com/arm-community-blogs/b/architectures-and-processors...
[2] https://lore.kernel.org/linux-arm-kernel/20240503130147.1154804-1-joey.gouly...
[3] Documentation/core-api/protection-keys.rst
[4] https://lore.kernel.org/linux-arm-kernel/20240806-kvm-arm64-get-reg-list-v2-...
Joey Gouly (30):
  powerpc/mm: add ARCH_PKEY_BITS to Kconfig
  x86/mm: add ARCH_PKEY_BITS to Kconfig
  mm: use ARCH_PKEY_BITS to define VM_PKEY_BITN
  arm64: disable trapping of POR_EL0 to EL2
  arm64: cpufeature: add Permission Overlay Extension cpucap
  arm64: context switch POR_EL0 register
  KVM: arm64: Save/restore POE registers
  KVM: arm64: make kvm_at() take an OP_AT_*
  KVM: arm64: use `at s1e1a` for POE
  KVM: arm64: Sanitise ID_AA64MMFR3_EL1
  arm64: enable the Permission Overlay Extension for EL0
  arm64: re-order MTE VM_ flags
  arm64: add POIndex defines
  arm64: convert protection key into vm_flags and pgprot values
  arm64: mask out POIndex when modifying a PTE
  arm64: handle PKEY/POE faults
  arm64: add pte_access_permitted_no_overlay()
  arm64: implement PKEYS support
  arm64: add POE signal support
  arm64/ptrace: add support for FEAT_POE
  arm64: enable POE and PIE to coexist
  arm64: enable PKEY support for CPUs with S1POE
  arm64: add Permission Overlay Extension Kconfig
  kselftest/arm64: move get_header()
  selftests: mm: move fpregs printing
  selftests: mm: make protection_keys test work on arm64
  kselftest/arm64: add HWCAP test for FEAT_S1POE
  kselftest/arm64: parse POE_MAGIC in a signal frame
  kselftest/arm64: Add test case for POR_EL0 signal frame records
  KVM: selftests: get-reg-list: add Permission Overlay registers
 Documentation/arch/arm64/elf_hwcaps.rst       |   2 +
 arch/arm64/Kconfig                            |  23 +++
 arch/arm64/include/asm/cpufeature.h           |   6 +
 arch/arm64/include/asm/el2_setup.h            |  10 +-
 arch/arm64/include/asm/hwcap.h                |   1 +
 arch/arm64/include/asm/kvm_asm.h              |   3 +-
 arch/arm64/include/asm/kvm_host.h             |   4 +
 arch/arm64/include/asm/mman.h                 |  10 +-
 arch/arm64/include/asm/mmu.h                  |   1 +
 arch/arm64/include/asm/mmu_context.h          |  46 +++++-
 arch/arm64/include/asm/pgtable-hwdef.h        |  10 ++
 arch/arm64/include/asm/pgtable-prot.h         |   8 +-
 arch/arm64/include/asm/pgtable.h              |  34 ++++-
 arch/arm64/include/asm/pkeys.h                | 108 ++++++++++++++
 arch/arm64/include/asm/por.h                  |  33 +++++
 arch/arm64/include/asm/processor.h            |   1 +
 arch/arm64/include/asm/sysreg.h               |   3 +
 arch/arm64/include/asm/traps.h                |   1 +
 arch/arm64/include/asm/vncr_mapping.h         |   1 +
 arch/arm64/include/uapi/asm/hwcap.h           |   1 +
 arch/arm64/include/uapi/asm/mman.h            |   9 ++
 arch/arm64/include/uapi/asm/sigcontext.h      |   7 +
 arch/arm64/kernel/cpufeature.c                |  23 +++
 arch/arm64/kernel/cpuinfo.c                   |   1 +
 arch/arm64/kernel/process.c                   |  28 ++++
 arch/arm64/kernel/ptrace.c                    |  46 ++++++
 arch/arm64/kernel/signal.c                    |  62 ++++++++
 arch/arm64/kernel/traps.c                     |   6 +
 arch/arm64/kvm/hyp/include/hyp/fault.h        |   5 +-
 arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h    |  27 ++++
 arch/arm64/kvm/sys_regs.c                     |  25 +++-
 arch/arm64/mm/fault.c                         |  55 ++++++-
 arch/arm64/mm/mmap.c                          |  11 ++
 arch/arm64/mm/mmu.c                           |  45 ++++++
 arch/arm64/tools/cpucaps                      |   1 +
 arch/powerpc/Kconfig                          |   4 +
 arch/x86/Kconfig                              |   4 +
 fs/proc/task_mmu.c                            |   2 +
 include/linux/mm.h                            |  20 ++-
 include/uapi/linux/elf.h                      |   1 +
 tools/testing/selftests/arm64/abi/hwcap.c     |  14 ++
 .../testing/selftests/arm64/signal/.gitignore |   1 +
 .../arm64/signal/testcases/poe_siginfo.c      |  86 +++++++++++
 .../arm64/signal/testcases/testcases.c        |  27 +---
 .../arm64/signal/testcases/testcases.h        |  28 +++-
 .../selftests/kvm/aarch64/get-reg-list.c      |  14 ++
 tools/testing/selftests/mm/Makefile           |   2 +-
 tools/testing/selftests/mm/pkey-arm64.h       | 139 ++++++++++++++++++
 tools/testing/selftests/mm/pkey-helpers.h     |   8 +
 tools/testing/selftests/mm/pkey-powerpc.h     |   3 +
 tools/testing/selftests/mm/pkey-x86.h         |   4 +
 tools/testing/selftests/mm/protection_keys.c  | 109 ++++++++++++--
 52 files changed, 1060 insertions(+), 63 deletions(-)
 create mode 100644 arch/arm64/include/asm/pkeys.h
 create mode 100644 arch/arm64/include/asm/por.h
 create mode 100644 tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c
 create mode 100644 tools/testing/selftests/mm/pkey-arm64.h
The new config option specifies how many bits are in each PKEY.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: linuxppc-dev@lists.ozlabs.org
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)
diff --git arch/powerpc/Kconfig arch/powerpc/Kconfig
index d7b09b064a8a..8a4ee57cd4ef 100644
--- arch/powerpc/Kconfig
+++ arch/powerpc/Kconfig
@@ -1026,6 +1026,10 @@ config PPC_MEM_KEYS

 	  If unsure, say y.

+config ARCH_PKEY_BITS
+	int
+	default 5
+
 config PPC_SECURE_BOOT
 	prompt "Enable secure boot support"
 	bool
The new config option specifies how many bits are in each PKEY.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: x86@kernel.org
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
---
 arch/x86/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)
diff --git arch/x86/Kconfig arch/x86/Kconfig
index 007bab9f2a0e..683c0a64efe2 100644
--- arch/x86/Kconfig
+++ arch/x86/Kconfig
@@ -1889,6 +1889,10 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS

 	  If unsure, say y.

+config ARCH_PKEY_BITS
+	int
+	default 4
+
 choice
 	prompt "TSX enable mode"
 	depends on CPU_SUP_INTEL
Use the new CONFIG_ARCH_PKEY_BITS to simplify setting these bits for different architectures.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-mm@kvack.org
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 fs/proc/task_mmu.c |  2 ++
 include/linux/mm.h | 16 ++++++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)
diff --git fs/proc/task_mmu.c fs/proc/task_mmu.c
index 5f171ad7b436..2c5f4814aef9 100644
--- fs/proc/task_mmu.c
+++ fs/proc/task_mmu.c
@@ -976,7 +976,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_PKEY_BIT0)] = "",
 		[ilog2(VM_PKEY_BIT1)] = "",
 		[ilog2(VM_PKEY_BIT2)] = "",
+#if VM_PKEY_BIT3
 		[ilog2(VM_PKEY_BIT3)] = "",
+#endif
 #if VM_PKEY_BIT4
 		[ilog2(VM_PKEY_BIT4)] = "",
 #endif
diff --git include/linux/mm.h include/linux/mm.h
index 6549d0979b28..56dc2481cc0f 100644
--- include/linux/mm.h
+++ include/linux/mm.h
@@ -330,12 +330,16 @@ extern unsigned int kobjsize(const void *objp);
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

 #ifdef CONFIG_ARCH_HAS_PKEYS
-# define VM_PKEY_SHIFT	VM_HIGH_ARCH_BIT_0
-# define VM_PKEY_BIT0	VM_HIGH_ARCH_0	/* A protection key is a 4-bit value */
-# define VM_PKEY_BIT1	VM_HIGH_ARCH_1	/* on x86 and 5-bit value on ppc64   */
-# define VM_PKEY_BIT2	VM_HIGH_ARCH_2
-# define VM_PKEY_BIT3	VM_HIGH_ARCH_3
-#ifdef CONFIG_PPC
+# define VM_PKEY_SHIFT	VM_HIGH_ARCH_BIT_0
+# define VM_PKEY_BIT0	VM_HIGH_ARCH_0
+# define VM_PKEY_BIT1	VM_HIGH_ARCH_1
+# define VM_PKEY_BIT2	VM_HIGH_ARCH_2
+#if CONFIG_ARCH_PKEY_BITS > 3
+# define VM_PKEY_BIT3	VM_HIGH_ARCH_3
+#else
+# define VM_PKEY_BIT3	0
+#endif
+#if CONFIG_ARCH_PKEY_BITS > 4
 # define VM_PKEY_BIT4	VM_HIGH_ARCH_4
 #else
 # define VM_PKEY_BIT4	0
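To illustrate the effect (not part of the patch): arch code can now pack a pkey into vm_flags with unconditional-looking code, because any VM_PKEY_BITn beyond ARCH_PKEY_BITS is defined to 0 and its term folds away at compile time. The helper name below is made up for this sketch:

static inline unsigned long pkey_to_vmflag_bits(unsigned long pkey)
{
	/* With ARCH_PKEY_BITS == 3 (e.g. arm64 POE), the BIT3/BIT4
	   terms are constant zero and vanish. */
	return (pkey & 0x1  ? VM_PKEY_BIT0 : 0) |
	       (pkey & 0x2  ? VM_PKEY_BIT1 : 0) |
	       (pkey & 0x4  ? VM_PKEY_BIT2 : 0) |
	       (pkey & 0x8  ? VM_PKEY_BIT3 : 0) |
	       (pkey & 0x10 ? VM_PKEY_BIT4 : 0);
}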
Allow EL0 or EL1 to access POR_EL0 without being trapped to EL2.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/include/asm/el2_setup.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git arch/arm64/include/asm/el2_setup.h arch/arm64/include/asm/el2_setup.h
index fd87c4b8f984..212191ecad40 100644
--- arch/arm64/include/asm/el2_setup.h
+++ arch/arm64/include/asm/el2_setup.h
@@ -185,12 +185,20 @@
 .Lset_pie_fgt_\@:
 	mrs_s	x1, SYS_ID_AA64MMFR3_EL1
 	ubfx	x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4
-	cbz	x1, .Lset_fgt_\@
+	cbz	x1, .Lset_poe_fgt_\@

 	/* Disable trapping of PIR_EL1 / PIRE0_EL1 */
 	orr	x0, x0, #HFGxTR_EL2_nPIR_EL1
 	orr	x0, x0, #HFGxTR_EL2_nPIRE0_EL1

+.Lset_poe_fgt_\@:
+	mrs_s	x1, SYS_ID_AA64MMFR3_EL1
+	ubfx	x1, x1, #ID_AA64MMFR3_EL1_S1POE_SHIFT, #4
+	cbz	x1, .Lset_fgt_\@
+
+	/* Disable trapping of POR_EL0 */
+	orr	x0, x0, #HFGxTR_EL2_nPOR_EL0
+
 .Lset_fgt_\@:
 	msr_s	SYS_HFGRTR_EL2, x0
 	msr_s	SYS_HFGWTR_EL2, x0
On Thu, Aug 22, 2024 at 04:10:47PM +0100, Joey Gouly wrote:
Allow EL0 or EL1 to access POR_EL0 without being trapped to EL2.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>

 arch/arm64/include/asm/el2_setup.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git arch/arm64/include/asm/el2_setup.h arch/arm64/include/asm/el2_setup.h
index fd87c4b8f984..212191ecad40 100644
--- arch/arm64/include/asm/el2_setup.h
+++ arch/arm64/include/asm/el2_setup.h
@@ -185,12 +185,20 @@
 .Lset_pie_fgt_\@:
 	mrs_s	x1, SYS_ID_AA64MMFR3_EL1
 	ubfx	x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4
-	cbz	x1, .Lset_fgt_\@
+	cbz	x1, .Lset_poe_fgt_\@

 	/* Disable trapping of PIR_EL1 / PIRE0_EL1 */
 	orr	x0, x0, #HFGxTR_EL2_nPIR_EL1
 	orr	x0, x0, #HFGxTR_EL2_nPIRE0_EL1

+.Lset_poe_fgt_\@:
+	mrs_s	x1, SYS_ID_AA64MMFR3_EL1
+	ubfx	x1, x1, #ID_AA64MMFR3_EL1_S1POE_SHIFT, #4
+	cbz	x1, .Lset_fgt_\@
+
+	/* Disable trapping of POR_EL0 */
+	orr	x0, x0, #HFGxTR_EL2_nPOR_EL0
Dave's reworking the labels here on for-next/misc:
https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/commit/?h=fo...
Please can you follow that new style for the new label so that we don't end up with an immediate inconsistency? Leave the '.Lset_fgt' label as-is and I'll fix it when we merge the branches.
Cheers,
Will
This cpucap indicates whether the system supports POE. It is a BOOT_CPU_FEATURE, as the boot CPU will enable POE if it has it, so secondary CPUs must also have this feature.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/kernel/cpufeature.c | 9 +++++++++
 arch/arm64/tools/cpucaps       | 1 +
 2 files changed, 10 insertions(+)
diff --git arch/arm64/kernel/cpufeature.c arch/arm64/kernel/cpufeature.c
index 646ecd3069fd..2daf5597cd65 100644
--- arch/arm64/kernel/cpufeature.c
+++ arch/arm64/kernel/cpufeature.c
@@ -2870,6 +2870,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.matches = has_nv1,
 		ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1)
 	},
+#ifdef CONFIG_ARM64_POE
+	{
+		.desc = "Stage-1 Permission Overlay Extension (S1POE)",
+		.capability = ARM64_HAS_S1POE,
+		.type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
+		.matches = has_cpuid_feature,
+		ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP)
+	},
+#endif
 	{},
 };
diff --git arch/arm64/tools/cpucaps arch/arm64/tools/cpucaps
index ac3429d892b9..eedb5acc21ed 100644
--- arch/arm64/tools/cpucaps
+++ arch/arm64/tools/cpucaps
@@ -45,6 +45,7 @@ HAS_MOPS
 HAS_NESTED_VIRT
 HAS_PAN
 HAS_S1PIE
+HAS_S1POE
 HAS_RAS_EXTN
 HAS_RNG
 HAS_SB
POR_EL0 is a register that can be modified by userspace directly, so it must be context switched.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h |  6 ++++++
 arch/arm64/include/asm/processor.h  |  1 +
 arch/arm64/include/asm/sysreg.h     |  3 +++
 arch/arm64/kernel/process.c         | 28 ++++++++++++++++++++++++++++
 4 files changed, 38 insertions(+)
diff --git arch/arm64/include/asm/cpufeature.h arch/arm64/include/asm/cpufeature.h
index 558434267271..3d261cc123c1 100644
--- arch/arm64/include/asm/cpufeature.h
+++ arch/arm64/include/asm/cpufeature.h
@@ -832,6 +832,12 @@ static inline bool system_supports_lpa2(void)
 	return cpus_have_final_cap(ARM64_HAS_LPA2);
 }

+static inline bool system_supports_poe(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_POE) &&
+		alternative_has_cap_unlikely(ARM64_HAS_S1POE);
+}
+
 int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
 bool try_emulate_mrs(struct pt_regs *regs, u32 isn);

diff --git arch/arm64/include/asm/processor.h arch/arm64/include/asm/processor.h
index f77371232d8c..e6376f979273 100644
--- arch/arm64/include/asm/processor.h
+++ arch/arm64/include/asm/processor.h
@@ -184,6 +184,7 @@ struct thread_struct {
 	u64			sctlr_user;
 	u64			svcr;
 	u64			tpidr2_el0;
+	u64			por_el0;
 };

 static inline unsigned int thread_get_vl(struct thread_struct *thread,
diff --git arch/arm64/include/asm/sysreg.h arch/arm64/include/asm/sysreg.h
index 4a9ea103817e..494e9efd856f 100644
--- arch/arm64/include/asm/sysreg.h
+++ arch/arm64/include/asm/sysreg.h
@@ -1077,6 +1077,9 @@
 #define POE_RXW		UL(0x7)
 #define POE_MASK	UL(0xf)

+/* Initial value for Permission Overlay Extension for EL0 */
+#define POR_EL0_INIT	POE_RXW
+
 #define ARM64_FEATURE_FIELD_BITS	4

 /* Defined for compatibility only, do not add new users. */
diff --git arch/arm64/kernel/process.c arch/arm64/kernel/process.c
index 4ae31b7af6c3..a3a61ecdb165 100644
--- arch/arm64/kernel/process.c
+++ arch/arm64/kernel/process.c
@@ -271,12 +271,23 @@ static void flush_tagged_addr_state(void)
 	clear_thread_flag(TIF_TAGGED_ADDR);
 }

+static void flush_poe(void)
+{
+	if (!system_supports_poe())
+		return;
+
+	write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
+	/* ISB required for kernel uaccess routines when changing POR_EL0 */
+	isb();
+}
+
 void flush_thread(void)
 {
 	fpsimd_flush_thread();
 	tls_thread_flush();
 	flush_ptrace_hw_breakpoint(current);
 	flush_tagged_addr_state();
+	flush_poe();
 }

 void arch_release_task_struct(struct task_struct *tsk)
@@ -371,6 +382,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	if (system_supports_tpidr2())
 		p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);

+	if (system_supports_poe())
+		p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
+
 	if (stack_start) {
 		if (is_compat_thread(task_thread_info(p)))
 			childregs->compat_sp = stack_start;
@@ -495,6 +509,19 @@ static void erratum_1418040_new_exec(void)
 	preempt_enable();
 }

+static void permission_overlay_switch(struct task_struct *next)
+{
+	if (!system_supports_poe())
+		return;
+
+	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
+	if (current->thread.por_el0 != next->thread.por_el0) {
+		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
+		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
+		isb();
+	}
+}
+
 /*
  * __switch_to() checks current->thread.sctlr_user as an optimisation. Therefore
  * this function must be called with preemption disabled and the update to
@@ -530,6 +557,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	ssbs_thread_switch(next);
 	erratum_1418040_thread_switch(next);
 	ptrauth_thread_switch_user(next);
+	permission_overlay_switch(next);

 	/*
 	 * Complete any pending TLB or cache maintenance on this CPU in case
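As background for the POR_EL0_INIT value above: POR_EL0 holds one 4-bit permission field per overlay index (pkey), so POE_RXW in field 0 leaves pkey 0 fully accessible. A sketch of updating one field, using the POE_* constants from the sysreg.h hunk; the helper name is hypothetical:

static inline u64 por_set_pkey_perms(u64 por, unsigned int pkey, u64 perms)
{
	unsigned int shift = pkey * 4;		/* 4 bits per overlay index */

	por &= ~(POE_MASK << shift);		/* clear the old field */
	por |= perms << shift;			/* install e.g. POE_RXW */
	return por;
}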
On Thu, Aug 22, 2024 at 04:10:49PM +0100, Joey Gouly wrote:
POR_EL0 is a register that can be modified by userspace directly, so it must be context switched.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

 arch/arm64/include/asm/cpufeature.h |  6 ++++++
 arch/arm64/include/asm/processor.h  |  1 +
 arch/arm64/include/asm/sysreg.h     |  3 +++
 arch/arm64/kernel/process.c         | 28 ++++++++++++++++++++++++++++
 4 files changed, 38 insertions(+)
[...]
+static void permission_overlay_switch(struct task_struct *next)
+{
+	if (!system_supports_poe())
+		return;
+
+	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
+	if (current->thread.por_el0 != next->thread.por_el0) {
+		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
+		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
nit: typo "chaning".
But more substantially, is this just to prevent spurious faults in the context of a new thread using a stale value for POR_EL0?
Will
On Fri, Aug 23, 2024 at 03:45:32PM +0100, Will Deacon wrote:
On Thu, Aug 22, 2024 at 04:10:49PM +0100, Joey Gouly wrote:
+static void permission_overlay_switch(struct task_struct *next)
+{
+	if (!system_supports_poe())
+		return;
+
+	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
+	if (current->thread.por_el0 != next->thread.por_el0) {
+		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
+		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
nit: typo "chaning".
But more substantially, is this just to prevent spurious faults in the context of a new thread using a stale value for POR_EL0?
Not just prevent faults but enforce the permissions from the new thread's POR_EL0. The kernel may continue with a uaccess routine from here, we can't tell.
On Fri, Aug 23, 2024 at 05:41:06PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 03:45:32PM +0100, Will Deacon wrote:
On Thu, Aug 22, 2024 at 04:10:49PM +0100, Joey Gouly wrote:
+static void permission_overlay_switch(struct task_struct *next)
+{
+	if (!system_supports_poe())
+		return;
+
+	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
+	if (current->thread.por_el0 != next->thread.por_el0) {
+		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
+		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
nit: typo "chaning".
But more substantially, is this just to prevent spurious faults in the context of a new thread using a stale value for POR_EL0?
Not just prevent faults but enforce the permissions from the new thread's POR_EL0. The kernel may continue with a uaccess routine from here, we can't tell.
Hmm, I wondered if that was the case. It's a bit weird though, because:
- There's a window between switch_mm() and switch_to() where you might reasonably expect to be able to execute uaccess routines
- kthread_use_mm() doesn't/can't look at this at all
- GUP obviously doesn't care
So what do we actually gain by having the uaccess routines honour this?
Will
On Fri, Aug 23, 2024 at 06:08:36PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 05:41:06PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 03:45:32PM +0100, Will Deacon wrote:
On Thu, Aug 22, 2024 at 04:10:49PM +0100, Joey Gouly wrote:
+static void permission_overlay_switch(struct task_struct *next)
+{
+	if (!system_supports_poe())
+		return;
+
+	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
+	if (current->thread.por_el0 != next->thread.por_el0) {
+		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
+		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
nit: typo "chaning".
But more substantially, is this just to prevent spurious faults in the context of a new thread using a stale value for POR_EL0?
Not just prevent faults but enforce the permissions from the new thread's POR_EL0. The kernel may continue with a uaccess routine from here, we can't tell.
Hmm, I wondered if that was the case. It's a bit weird though, because:
- There's a window between switch_mm() and switch_to() where you might reasonably expect to be able to execute uaccess routines
I don't think we can have any uaccess between these two switches (a uaccess could fault, that's a pretty weird state between these two).
- kthread_use_mm() doesn't/can't look at this at all
No, but a kthread would have its own, most permissive, POR_EL0.
- GUP obviously doesn't care
So what do we actually gain by having the uaccess routines honour this?
I guess where it matters is more like not accidentally faulting because the previous thread had more restrictive permissions.
On Fri, Aug 23, 2024 at 07:40:52PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 06:08:36PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 05:41:06PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 03:45:32PM +0100, Will Deacon wrote:
On Thu, Aug 22, 2024 at 04:10:49PM +0100, Joey Gouly wrote:
+static void permission_overlay_switch(struct task_struct *next)
+{
+	if (!system_supports_poe())
+		return;
+
+	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
+	if (current->thread.por_el0 != next->thread.por_el0) {
+		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
+		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
nit: typo "chaning".
But more substantially, is this just to prevent spurious faults in the context of a new thread using a stale value for POR_EL0?
Not just prevent faults but enforce the permissions from the new thread's POR_EL0. The kernel may continue with a uaccess routine from here, we can't tell.
Hmm, I wondered if that was the case. It's a bit weird though, because:
- There's a window between switch_mm() and switch_to() where you might reasonably expect to be able to execute uaccess routines
I don't think we can have any uaccess between these two switches (a uaccess could fault, that's a pretty weird state between these two).
- kthread_use_mm() doesn't/can't look at this at all
No, but a kthread would have its own, most permissive, POR_EL0.
- GUP obviously doesn't care
So what do we actually gain by having the uaccess routines honour this?
I guess where it matters is more like not accidentally faulting because the previous thread had more restrictive permissions.
That's what I wondered initially, but won't the fault handler retry in that case?
Will
On Tue, Aug 27, 2024 at 12:38:04PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 07:40:52PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 06:08:36PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 05:41:06PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 03:45:32PM +0100, Will Deacon wrote:
On Thu, Aug 22, 2024 at 04:10:49PM +0100, Joey Gouly wrote:
+static void permission_overlay_switch(struct task_struct *next)
+{
+	if (!system_supports_poe())
+		return;
+
+	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
+	if (current->thread.por_el0 != next->thread.por_el0) {
+		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
+		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
nit: typo "chaning".
But more substantially, is this just to prevent spurious faults in the context of a new thread using a stale value for POR_EL0?
Not just prevent faults but enforce the permissions from the new thread's POR_EL0. The kernel may continue with a uaccess routine from here, we can't tell.
[...]
So what do we actually gain by having the uaccess routines honour this?
I guess where it matters is more like not accidentally faulting because the previous thread had more restrictive permissions.
That's what I wondered initially, but won't the fault handler retry in that case?
Yes, it will retry and this should be fine (I assume you are only talking about dropping the ISB in the context switch).
For the case of running with a more permissive stale POR_EL0, arguably it's slightly more predictable for the user but, OTOH, some syscalls like readv() could be routed through GUP with no checks. As with MTE, we don't guarantee uaccesses honour the user permissions.
That said, at some point we should sanitise this path anyway and have a single ISB at the end. In the meantime, I'm fine with dropping the ISB here.
On Mon, Sep 02, 2024 at 08:08:08PM +0100, Catalin Marinas wrote:
On Tue, Aug 27, 2024 at 12:38:04PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 07:40:52PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 06:08:36PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 05:41:06PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 03:45:32PM +0100, Will Deacon wrote:
On Thu, Aug 22, 2024 at 04:10:49PM +0100, Joey Gouly wrote:
> +static void permission_overlay_switch(struct task_struct *next)
> +{
> +	if (!system_supports_poe())
> +		return;
> +
> +	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
> +	if (current->thread.por_el0 != next->thread.por_el0) {
> +		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
> +		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
nit: typo "chaning".
But more substantially, is this just to prevent spurious faults in the context of a new thread using a stale value for POR_EL0?
Not just prevent faults but enforce the permissions from the new thread's POR_EL0. The kernel may continue with a uaccess routine from here, we can't tell.
[...]
So what do we actually gain by having the uaccess routines honour this?
I guess where it matters is more like not accidentally faulting because the previous thread had more restrictive permissions.
That's what I wondered initially, but won't the fault handler retry in that case?
Yes, it will retry and this should be fine (I assume you are only talking about dropping the ISB in the context switch).
For the case of running with a more permissive stale POR_EL0, arguably it's slightly more predictable for the user but, OTOH, some syscalls like readv() could be routed through GUP with no checks. As with MTE, we don't guarantee uaccesses honour the user permissions.
That said, at some point we should sanitise this path anyway and have a single ISB at the end. In the meantime, I'm fine with dropping the ISB here.
commit 3141fb86bee8d48ae47cab1594dad54f974a8899
Author: Joey Gouly <joey.gouly@arm.com>
Date:   Tue Sep 3 15:47:26 2024 +0100
fixup! arm64: context switch POR_EL0 register
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a3a61ecdb165..c224b0955f1a 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -515,11 +515,8 @@ static void permission_overlay_switch(struct task_struct *next)
 		return;

 	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
-	if (current->thread.por_el0 != next->thread.por_el0) {
+	if (current->thread.por_el0 != next->thread.por_el0)
 		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
-		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
-		isb();
-	}
 }

 /*
Will, do you want me to re-send the series with this and the permissions diff from the other thread [1], or you ok with applying them when you pull it in?
Thanks, Joey
[1] https://lore.kernel.org/all/20240903144823.GA3669886@e124191.cambridge.arm.c...
On Tue, Sep 03, 2024 at 03:54:13PM +0100, Joey Gouly wrote:
On Mon, Sep 02, 2024 at 08:08:08PM +0100, Catalin Marinas wrote:
On Tue, Aug 27, 2024 at 12:38:04PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 07:40:52PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 06:08:36PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 05:41:06PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 03:45:32PM +0100, Will Deacon wrote:
> On Thu, Aug 22, 2024 at 04:10:49PM +0100, Joey Gouly wrote:
> > +static void permission_overlay_switch(struct task_struct *next)
> > +{
> > +	if (!system_supports_poe())
> > +		return;
> > +
> > +	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
> > +	if (current->thread.por_el0 != next->thread.por_el0) {
> > +		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
> > +		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
Not just prevent faults but enforce the permissions from the new thread's POR_EL0. The kernel may continue with a uaccess routine from here, we can't tell.
[...]
So what do we actually gain by having the uaccess routines honour this?
I guess where it matters is more like not accidentally faulting because the previous thread had more restrictive permissions.
That's what I wondered initially, but won't the fault handler retry in that case?
Yes, it will retry and this should be fine (I assume you are only talking about dropping the ISB in the context switch).
For the case of running with a more permissive stale POR_EL0, arguably it's slightly more predictable for the user but, OTOH, some syscalls like readv() could be routed through GUP with no checks. As with MTE, we don't guarantee uaccesses honour the user permissions.
That said, at some point we should sanitise this path anyway and have a single ISB at the end. In the meantime, I'm fine with dropping the ISB here.
commit 3141fb86bee8d48ae47cab1594dad54f974a8899
Author: Joey Gouly <joey.gouly@arm.com>
Date:   Tue Sep 3 15:47:26 2024 +0100
fixup! arm64: context switch POR_EL0 register
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a3a61ecdb165..c224b0955f1a 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -515,11 +515,8 @@ static void permission_overlay_switch(struct task_struct *next)
 		return;
 	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
-	if (current->thread.por_el0 != next->thread.por_el0) {
+	if (current->thread.por_el0 != next->thread.por_el0)
 		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
-		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
-		isb();
-	}
 }
What about the one in flush_poe()? I'm inclined to drop that as well.
Will, do you want me to re-send the series with this and the permissions diff from the other thread [1], or you ok with applying them when you pull it in?
I'll have a crack now, but if it fails miserably then I'll let you know.
Will
On Wed, Sep 04, 2024 at 11:22:54AM +0100, Will Deacon wrote:
On Tue, Sep 03, 2024 at 03:54:13PM +0100, Joey Gouly wrote:
On Mon, Sep 02, 2024 at 08:08:08PM +0100, Catalin Marinas wrote:
On Tue, Aug 27, 2024 at 12:38:04PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 07:40:52PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 06:08:36PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 05:41:06PM +0100, Catalin Marinas wrote:
> On Fri, Aug 23, 2024 at 03:45:32PM +0100, Will Deacon wrote:
> > On Thu, Aug 22, 2024 at 04:10:49PM +0100, Joey Gouly wrote:
> > > +static void permission_overlay_switch(struct task_struct *next)
> > > +{
> > > +	if (!system_supports_poe())
> > > +		return;
> > > +
> > > +	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
> > > +	if (current->thread.por_el0 != next->thread.por_el0) {
> > > +		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
> > > +		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
> >
> > nit: typo "chaning".
> >
> > But more substantially, is this just to prevent spurious faults in the
> > context of a new thread using a stale value for POR_EL0?
>
> Not just prevent faults but enforce the permissions from the new
> thread's POR_EL0. The kernel may continue with a uaccess routine from
> here, we can't tell.
[...]
So what do we actually gain by having the uaccess routines honour this?
I guess where it matters is more like not accidentally faulting because the previous thread had more restrictive permissions.
That's what I wondered initially, but won't the fault handler retry in that case?
Yes, it will retry and this should be fine (I assume you are only talking about dropping the ISB in the context switch).
For the case of running with a more permissive stale POR_EL0, arguably it's slightly more predictable for the user but, OTOH, some syscalls like readv() could be routed through GUP with no checks. As with MTE, we don't guarantee uaccesses honour the user permissions.
That said, at some point we should sanitise this path anyway and have a single ISB at the end. In the meantime, I'm fine with dropping the ISB here.
commit 3141fb86bee8d48ae47cab1594dad54f974a8899
Author: Joey Gouly <joey.gouly@arm.com>
Date:   Tue Sep 3 15:47:26 2024 +0100
fixup! arm64: context switch POR_EL0 register
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a3a61ecdb165..c224b0955f1a 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -515,11 +515,8 @@ static void permission_overlay_switch(struct task_struct *next)
 		return;
 	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
-	if (current->thread.por_el0 != next->thread.por_el0) {
+	if (current->thread.por_el0 != next->thread.por_el0)
 		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
-		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
-		isb();
-	}
 }
What about the one in flush_poe()? I'm inclined to drop that as well.
Yes, I guess that one can be removed too. Catalin, any comments?
Will, do you want me to re-send the series with this and the permissions diff from the other thread [1], or you ok with applying them when you pull it in?
I'll have a crack now, but if it fails miserably then I'll let you know.
Thanks! Just to make sure, you should pick the patch up from
https://lore.kernel.org/linux-arm-kernel/20240903152937.GA3768522@e124191.ca...
Not the one I linked to in [1] in my previous e-mail.
Thanks, Joey
On Wed, Sep 04, 2024 at 12:32:21PM +0100, Joey Gouly wrote:
On Wed, Sep 04, 2024 at 11:22:54AM +0100, Will Deacon wrote:
On Tue, Sep 03, 2024 at 03:54:13PM +0100, Joey Gouly wrote:
On Mon, Sep 02, 2024 at 08:08:08PM +0100, Catalin Marinas wrote:
On Tue, Aug 27, 2024 at 12:38:04PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 07:40:52PM +0100, Catalin Marinas wrote:
On Fri, Aug 23, 2024 at 06:08:36PM +0100, Will Deacon wrote:
> On Fri, Aug 23, 2024 at 05:41:06PM +0100, Catalin Marinas wrote:
> > On Fri, Aug 23, 2024 at 03:45:32PM +0100, Will Deacon wrote:
> > > On Thu, Aug 22, 2024 at 04:10:49PM +0100, Joey Gouly wrote:
> > > > +static void permission_overlay_switch(struct task_struct *next)
> > > > +{
> > > > +	if (!system_supports_poe())
> > > > +		return;
> > > > +
> > > > +	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
> > > > +	if (current->thread.por_el0 != next->thread.por_el0) {
> > > > +		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
> > > > +		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
> > >
> > > nit: typo "chaning".
> > >
> > > But more substantially, is this just to prevent spurious faults in the
> > > context of a new thread using a stale value for POR_EL0?
> >
> > Not just prevent faults but enforce the permissions from the new
> > thread's POR_EL0. The kernel may continue with a uaccess routine from
> > here, we can't tell.
[...]
> So what do we actually gain by having the uaccess routines honour this?
I guess where it matters is more like not accidentally faulting because the previous thread had more restrictive permissions.
That's what I wondered initially, but won't the fault handler retry in that case?
Yes, it will retry and this should be fine (I assume you are only talking about dropping the ISB in the context switch).
For the case of running with a more permissive stale POR_EL0, arguably it's slightly more predictable for the user but, OTOH, some syscalls like readv() could be routed through GUP with no checks. As with MTE, we don't guarantee uaccesses honour the user permissions.
That said, at some point we should sanitise this path anyway and have a single ISB at the end. In the meantime, I'm fine with dropping the ISB here.
commit 3141fb86bee8d48ae47cab1594dad54f974a8899
Author: Joey Gouly <joey.gouly@arm.com>
Date:   Tue Sep 3 15:47:26 2024 +0100
fixup! arm64: context switch POR_EL0 register
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a3a61ecdb165..c224b0955f1a 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -515,11 +515,8 @@ static void permission_overlay_switch(struct task_struct *next)
 		return;
 	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
-	if (current->thread.por_el0 != next->thread.por_el0) {
+	if (current->thread.por_el0 != next->thread.por_el0)
 		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
-		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
-		isb();
-	}
 }
What about the one in flush_poe()? I'm inclined to drop that as well.
Yes I guess that one can be removed too. Catalin any comments?
Will, do you want me to re-send the series with this and the permissions diff from the other thread [1], or you ok with applying them when you pull it in?
I'll have a crack now, but if it fails miserably then I'll let you know.
Thanks! Just to make sure, you should pick the patch up from
https://lore.kernel.org/linux-arm-kernel/20240903152937.GA3768522@e124191.ca...
Not the one I linked to in [1] in my previous e-mail.
Right, there's quite a lot I need to do:
  - Uncorrupt your patches
  - Fix the conflict in the kvm selftests
  - Drop the unnecessary ISBs
  - Fix the ESR checking
  - Fix the el2_setup labels
  - Reorder the patches
  - Drop the patch that is already in kvmarm
Working on it...
Will
On Wed, Sep 04, 2024 at 12:43:02PM +0100, Will Deacon wrote:
On Wed, Sep 04, 2024 at 12:32:21PM +0100, Joey Gouly wrote:
On Wed, Sep 04, 2024 at 11:22:54AM +0100, Will Deacon wrote:
On Tue, Sep 03, 2024 at 03:54:13PM +0100, Joey Gouly wrote:
On Mon, Sep 02, 2024 at 08:08:08PM +0100, Catalin Marinas wrote:
On Tue, Aug 27, 2024 at 12:38:04PM +0100, Will Deacon wrote:
On Fri, Aug 23, 2024 at 07:40:52PM +0100, Catalin Marinas wrote:
> On Fri, Aug 23, 2024 at 06:08:36PM +0100, Will Deacon wrote:
> > On Fri, Aug 23, 2024 at 05:41:06PM +0100, Catalin Marinas wrote:
> > > On Fri, Aug 23, 2024 at 03:45:32PM +0100, Will Deacon wrote:
> > > > On Thu, Aug 22, 2024 at 04:10:49PM +0100, Joey Gouly wrote:
> > > > > +static void permission_overlay_switch(struct task_struct *next)
> > > > > +{
> > > > > +	if (!system_supports_poe())
> > > > > +		return;
> > > > > +
> > > > > +	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
> > > > > +	if (current->thread.por_el0 != next->thread.por_el0) {
> > > > > +		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
> > > > > +		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
> > > >
> > > > nit: typo "chaning".
> > > >
> > > > But more substantially, is this just to prevent spurious faults in the
> > > > context of a new thread using a stale value for POR_EL0?
> > >
> > > Not just prevent faults but enforce the permissions from the new
> > > thread's POR_EL0. The kernel may continue with a uaccess routine from
> > > here, we can't tell.
[...]
> > So what do we actually gain by having the uaccess routines honour this?
>
> I guess where it matters is more like not accidentally faulting because
> the previous thread had more restrictive permissions.
That's what I wondered initially, but won't the fault handler retry in that case?
Yes, it will retry and this should be fine (I assume you are only talking about dropping the ISB in the context switch).
For the case of running with a more permissive stale POR_EL0, arguably it's slightly more predictable for the user but, OTOH, some syscalls like readv() could be routed through GUP with no checks. As with MTE, we don't guarantee uaccesses honour the user permissions.
That said, at some point we should sanitise this path anyway and have a single ISB at the end. In the meantime, I'm fine with dropping the ISB here.
commit 3141fb86bee8d48ae47cab1594dad54f974a8899
Author: Joey Gouly <joey.gouly@arm.com>
Date:   Tue Sep 3 15:47:26 2024 +0100
fixup! arm64: context switch POR_EL0 register
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a3a61ecdb165..c224b0955f1a 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -515,11 +515,8 @@ static void permission_overlay_switch(struct task_struct *next)
 		return;
 	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
-	if (current->thread.por_el0 != next->thread.por_el0) {
+	if (current->thread.por_el0 != next->thread.por_el0)
 		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
-		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
-		isb();
-	}
 }
What about the one in flush_poe()? I'm inclined to drop that as well.
Yes I guess that one can be removed too. Catalin any comments?
Will, do you want me to re-send the series with this and the permissions diff from the other thread [1], or you ok with applying them when you pull it in?
I'll have a crack now, but if it fails miserably then I'll let you know.
Thanks! Just to make sure, you should pick the patch up from
https://lore.kernel.org/linux-arm-kernel/20240903152937.GA3768522@e124191.ca...
Not the one I linked to in [1] in my previous e-mail.
Right, there's quite a lot I need to do:
- Uncorrupt your patches
- Fix the conflict in the kvm selftests
- Drop the unnecessary ISBs
- Fix the ESR checking
- Fix the el2_setup labels
- Reorder the patches
- Drop the patch that is already in kvmarm
Working on it...
Sorry! I'm happy to rebase onto some arm64 branch if that will help, just let me know.
Will
On Wed, Sep 04, 2024 at 01:55:03PM +0100, Joey Gouly wrote:
On Wed, Sep 04, 2024 at 12:43:02PM +0100, Will Deacon wrote:
Right, there's quite a lot I need to do:
- Uncorrupt your patches
- Fix the conflict in the kvm selftests
- Drop the unnecessary ISBs
- Fix the ESR checking
- Fix the el2_setup labels
- Reorder the patches
- Drop the patch that is already in kvmarm
Working on it...
Sorry! I'm happy to rebase onto some arm64 branch if that will help, just let me know.
Please have a look at for-next/poe (also merged into for-next/core and for-kernelci) and let me know what I got wrong!
For Marc: I reordered the series so the KVM bits (and deps) are all at the beginning, should you need them. The branch is based on a merge of the shared branch you created previously.
Cheers,
Will
On Wed, 04 Sep 2024 17:17:58 +0100, Will Deacon will@kernel.org wrote:
On Wed, Sep 04, 2024 at 01:55:03PM +0100, Joey Gouly wrote:
On Wed, Sep 04, 2024 at 12:43:02PM +0100, Will Deacon wrote:
Right, there's quite a lot I need to do:
- Uncorrupt your patches
- Fix the conflict in the kvm selftests
- Drop the unnecessary ISBs
- Fix the ESR checking
- Fix the el2_setup labels
- Reorder the patches
- Drop the patch that is already in kvmarm
Working on it...
Sorry! I'm happy to rebase onto some arm64 branch if that will help, just let me know.
Please have a look at for-next/poe (also merged into for-next/core and for-kernelci) and let me know what I got wrong!
For Marc: I reordered the series so the KVM bits (and deps) are all at the beginning, should you need them. The branch is based on a merge of the shared branch you created previously.
I just had a quick check, and while there is a small conflict with kvmarm/next, it is extremely minor (small clash in the vcpu_sysreg, for which the resolving order doesn't matter), and not worth dragging additional patches in the shared branch.
However, if KVM's own S1PIE series [1] ends up being merged (which I'd really like), I will definitely have to pull the prefix in, as this is a bit more involved conflict wise.
Thanks,
M.
[1] http://lore.kernel.org/all/20240903153834.1909472-1-maz@kernel.org
On Wed, Sep 04, 2024 at 05:17:58PM +0100, Will Deacon wrote:
On Wed, Sep 04, 2024 at 01:55:03PM +0100, Joey Gouly wrote:
On Wed, Sep 04, 2024 at 12:43:02PM +0100, Will Deacon wrote:
Right, there's quite a lot I need to do:
- Uncorrupt your patches
- Fix the conflict in the kvm selftests
- Drop the unnecessary ISBs
- Fix the ESR checking
- Fix the el2_setup labels
- Reorder the patches
- Drop the patch that is already in kvmarm
Working on it...
Sorry! I'm happy to rebase onto some arm64 branch if that will help, just let me know.
Please have a look at for-next/poe (also merged into for-next/core and for-kernelci) and let me know what I got wrong!
I pulled for-next/poe and ran the test and it works fine. Also looked at the diff of my branch against your branch, and it looks fine too.
Thanks for your work to get this merged!
For Marc: I reordered the series so the KVM bits (and deps) are all at the beginning, should you need them. The branch is based on a merge of the shared branch you created previously.
Cheers,
Will
On Wed, Sep 04, 2024 at 11:22:54AM +0100, Will Deacon wrote:
On Tue, Sep 03, 2024 at 03:54:13PM +0100, Joey Gouly wrote:
commit 3141fb86bee8d48ae47cab1594dad54f974a8899
Author: Joey Gouly <joey.gouly@arm.com>
Date:   Tue Sep 3 15:47:26 2024 +0100
fixup! arm64: context switch POR_EL0 register
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a3a61ecdb165..c224b0955f1a 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -515,11 +515,8 @@ static void permission_overlay_switch(struct task_struct *next)
 		return;
 	current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
-	if (current->thread.por_el0 != next->thread.por_el0) {
+	if (current->thread.por_el0 != next->thread.por_el0)
 		write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
-		/* ISB required for kernel uaccess routines when chaning POR_EL0 */
-		isb();
-	}
 }
What about the one in flush_poe()? I'm inclined to drop that as well.
Yes, that's a similar thing.
On 22/08/2024 17:10, Joey Gouly wrote:
@@ -371,6 +382,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	if (system_supports_tpidr2())
 		p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);

+	if (system_supports_poe())
+		p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
Here we are only reloading POR_EL0's value if the target is a user thread. However, as this series stands, POR_EL0 is also relevant to kthreads, because any uaccess or GUP done from a kthread will also be checked against POR_EL0. This is especially important in cases like the io_uring kthread, which accesses the memory of the user process that spawned it. To prevent such a kthread from inheriting a stale value of POR_EL0, it seems that we should reload POR_EL0's value in all cases (user and kernel thread).
Other approaches could also be considered (e.g. resetting POR_EL0 to unrestricted when creating a kthread), see my reply on v4 [1].
Kevin
[1] https://lore.kernel.org/linux-arm-kernel/b4f8b351-4c83-43b4-bfbe-8f67f3f56fb...
On 9/11/24 08:01, Kevin Brodsky wrote:
On 22/08/2024 17:10, Joey Gouly wrote:
@@ -371,6 +382,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	if (system_supports_tpidr2())
 		p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);

+	if (system_supports_poe())
+		p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
Here we are only reloading POR_EL0's value if the target is a user thread. However, as this series stands, POR_EL0 is also relevant to kthreads, because any uaccess or GUP done from a kthread will also be checked against POR_EL0. This is especially important in cases like the io_uring kthread, which accesses the memory of the user process that spawned it. To prevent such a kthread from inheriting a stale value of POR_EL0, it seems that we should reload POR_EL0's value in all cases (user and kernel thread).
The problem with this is trying to figure out which POR_EL0 to use. The kthread could have been spawned ages ago and might have a POR_EL0 which is very different from the current value of any of the threads in the process right now.
There's also no great way for a kthread to reach out and grab an updated value. It's all completely inherently racy.
Other approaches could also be considered (e.g. resetting POR_EL0 to unrestricted when creating a kthread), see my reply on v4 [1].
I kinda think this is the only way to go. It's the only sensible, predictable way. I _think_ it's what x86 will end up doing with PKRU, but there's been enough churn there that I'd need to go double check what happens in practice.
Either way, it would be nice to get an io_uring test in here that actually spawns kthreads:
tools/testing/selftests/mm/protection_keys.c
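A sketch of such a test using liburing (illustrative only, not part of the series; whether the read is actually punted to an io-wq kernel thread rather than completed inline depends on the file and kernel):

#define _GNU_SOURCE
#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fd = open("/etc/hostname", O_RDONLY);
	char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* Assign a pkey to the buffer and revoke this thread's access. */
	int pkey = pkey_alloc(0, 0);
	pkey_mprotect(buf, 4096, PROT_READ | PROT_WRITE, pkey);
	pkey_set(pkey, PKEY_DISABLE_ACCESS);

	/* The kernel-side worker writes into buf on our behalf. */
	io_uring_queue_init(8, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, 4096, 0);
	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);

	/* res < 0 (e.g. -EFAULT) would mean the worker honoured the pkey. */
	printf("read returned %d\n", cqe->res);

	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}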
Hi Dave,
On Wed, Sep 11, 2024 at 08:33:54AM -0700, Dave Hansen wrote:
On 9/11/24 08:01, Kevin Brodsky wrote:
On 22/08/2024 17:10, Joey Gouly wrote:
@@ -371,6 +382,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	if (system_supports_tpidr2())
 		p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);

+	if (system_supports_poe())
+		p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
Here we are only reloading POR_EL0's value if the target is a user thread. However, as this series stands, POR_EL0 is also relevant to kthreads, because any uaccess or GUP done from a kthread will also be checked against POR_EL0. This is especially important in cases like the io_uring kthread, which accesses the memory of the user process that spawned it. To prevent such a kthread from inheriting a stale value of POR_EL0, it seems that we should reload POR_EL0's value in all cases (user and kernel thread).
The problem with this is trying to figure out which POR_EL0 to use. The kthread could have been spawned ages ago and might have a POR_EL0 which is very different from the current value of any of the threads in the process right now.
There's also no great way for a kthread to reach out and grab an updated value. It's all completely inherently racy.
Other approaches could also be considered (e.g. resetting POR_EL0 to unrestricted when creating a kthread), see my reply on v4 [1].
I kinda think this is the only way to go. It's the only sensible, predictable way. I _think_ it's what x86 will end up doing with PKRU, but there's been enough churn there that I'd need to go double check what happens in practice.
I agree.
Either way, it would be nice to get an io_uring test in here that actually spawns kthreads:
tools/testing/selftests/mm/protection_keys.c
It would be good to update Documentation/core-api/protection-keys.rst as well, since the example with read() raises more questions than it answers!
Kevin, Joey -- I've got this series queued in arm64 as-is, so perhaps you could send some patches on top so we can iron this out in time for 6.12? I'll also be at LPC next week if you're about.
Cheers,
Will
On Thu, Sep 12, 2024 at 11:50:18AM +0100, Will Deacon wrote:
Hi Dave,
On Wed, Sep 11, 2024 at 08:33:54AM -0700, Dave Hansen wrote:
On 9/11/24 08:01, Kevin Brodsky wrote:
On 22/08/2024 17:10, Joey Gouly wrote:
@@ -371,6 +382,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	if (system_supports_tpidr2())
 		p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);

+	if (system_supports_poe())
+		p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
Here we are only reloading POR_EL0's value if the target is a user thread. However, as this series stands, POR_EL0 is also relevant to kthreads, because any uaccess or GUP done from a kthread will also be checked against POR_EL0. This is especially important in cases like the io_uring kthread, which accesses the memory of the user process that spawned it. To prevent such a kthread from inheriting a stale value of POR_EL0, it seems that we should reload POR_EL0's value in all cases (user and kernel thread).
The problem with this is trying to figure out which POR_EL0 to use. The kthread could have been spawned ages ago and might have a POR_EL0 which is very different from the current value of any of the threads in the process right now.
There's also no great way for a kthread to reach out and grab an updated value. It's all completely inherently racy.
Other approaches could also be considered (e.g. resetting POR_EL0 to unrestricted when creating a kthread), see my reply on v4 [1].
I kinda think this is the only way to go. It's the only sensible, predictable way. I _think_ it's what x86 will end up doing with PKRU, but there's been enough churn there that I'd need to go double check what happens in practice.
I agree.
Either way, it would be nice to get an io_uring test in here that actually spawns kthreads:
tools/testing/selftests/mm/protection_keys.c
It would be good to update Documentation/core-api/protection-keys.rst as well, since the example with read() raises more questions than it answers!
Kevin, Joey -- I've got this series queued in arm64 as-is, so perhaps you could send some patches on top so we can iron this out in time for 6.12? I'll also be at LPC next week if you're about.
I found the code in arch/x86 that does this, I must have missed this previously.
arch/x86/kernel/process.c: int copy_thread()
	/* Kernel thread ? */
	if (unlikely(p->flags & PF_KTHREAD)) {
		p->thread.pkru = pkru_get_init_value();
		memset(childregs, 0, sizeof(struct pt_regs));
		kthread_frame_init(frame, args->fn, args->fn_arg);
		return 0;
	}
I can send a similar patch for arm64. I have no idea how to write io_uring code, so I'm looking for examples I can work with to get a test written. Might just send the arm64 fix first, if that's fine?
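Presumably something like the following in copy_thread() (an untested sketch mirroring the x86 hunk above; args->fn is only set for kernel threads):

	if (system_supports_poe()) {
		if (unlikely(args->fn))
			/* Kernel thread: don't inherit the parent's POR_EL0,
			   reset to the most permissive value instead. */
			p->thread.por_el0 = POR_EL0_INIT;
		else
			p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
	}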
Thanks, Joey
On Thu, Sep 12, 2024 at 01:48:35PM +0100, Joey Gouly wrote:
On Thu, Sep 12, 2024 at 11:50:18AM +0100, Will Deacon wrote:
On Wed, Sep 11, 2024 at 08:33:54AM -0700, Dave Hansen wrote:
On 9/11/24 08:01, Kevin Brodsky wrote:
On 22/08/2024 17:10, Joey Gouly wrote:
@@ -371,6 +382,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	if (system_supports_tpidr2())
 		p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);

+	if (system_supports_poe())
+		p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
Here we are only reloading POR_EL0's value if the target is a user thread. However, as this series stands, POR_EL0 is also relevant to kthreads, because any uaccess or GUP done from a kthread will also be checked against POR_EL0. This is especially important in cases like the io_uring kthread, which accesses the memory of the user process that spawned it. To prevent such a kthread from inheriting a stale value of POR_EL0, it seems that we should reload POR_EL0's value in all cases (user and kernel thread).
The problem with this is trying to figure out which POR_EL0 to use. The kthread could have been spawned ages ago and might have a POR_EL0 which is very different from the current value of any of the threads in the process right now.
There's also no great way for a kthread to reach out and grab an updated value. It's all completely inherently racy.
Other approaches could also be considered (e.g. resetting POR_EL0 to unrestricted when creating a kthread), see my reply on v4 [1].
I kinda think this is the only way to go. It's the only sensible, predictable way. I _think_ it's what x86 will end up doing with PKRU, but there's been enough churn there that I'd need to go double check what happens in practice.
I agree.
Either way, it would be nice to get an io_uring test in here that actually spawns kthreads:
tools/testing/selftests/mm/protection_keys.c
It would be good to update Documentation/core-api/protection-keys.rst as well, since the example with read() raises more questions than it answers!
Kevin, Joey -- I've got this series queued in arm64 as-is, so perhaps you could send some patches on top so we can iron this out in time for 6.12? I'll also be at LPC next week if you're about.
I found the code in arch/x86 that does this, I must have missed this previously.
arch/x86/kernel/process.c: int copy_thread()
	/* Kernel thread ? */
	if (unlikely(p->flags & PF_KTHREAD)) {
		p->thread.pkru = pkru_get_init_value();
		memset(childregs, 0, sizeof(struct pt_regs));
		kthread_frame_init(frame, args->fn, args->fn_arg);
		return 0;
	}
I can send a similar patch for arm64. I have no idea how to write io_uring code, so I'm looking for examples I can work with to get a test written. Might just send the arm64 fix first, if that's fine?
I think fix + documentation is what we need before 6.12, but you've still got plenty of time after the merge window.
Cheers,
Will
Dave Hansen <dave.hansen@intel.com> writes:
On 9/11/24 08:01, Kevin Brodsky wrote:
On 22/08/2024 17:10, Joey Gouly wrote:
@@ -371,6 +382,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	if (system_supports_tpidr2())
 		p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);

+	if (system_supports_poe())
+		p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
Here we are only reloading POR_EL0's value if the target is a user thread. However, as this series stands, POR_EL0 is also relevant to kthreads, because any uaccess or GUP done from a kthread will also be checked against POR_EL0. This is especially important in cases like the io_uring kthread, which accesses the memory of the user process that spawned it. To prevent such a kthread from inheriting a stale value of POR_EL0, it seems that we should reload POR_EL0's value in all cases (user and kernel thread).
The problem with this is trying to figure out which POR_EL0 to use. The kthread could have been spawned ages ago and might have a POR_EL0 that is very different from the current value of any of the threads in the process right now.
There's also no great way for a kthread to reach out and grab an updated value. It's all completely inherently racy.
Other approaches could also be considered (e.g. resetting POR_EL0 to unrestricted when creating a kthread), see my reply on v4 [1].
I kinda think this is the only way to go. It's the only sensible, predictable way. I _think_ it's what x86 will end up doing with PKRU, but there's been enough churn there that I'd need to go double check what happens in practice.
that is also what powerpc does.
/*
 * usage of kthread_use_mm() should inherit the
 * AMR value of the operating address space. But, the AMR value is
 * thread-specific and we inherit the address space and not thread
 * access restrictions. Because of this ignore AMR value when accessing
 * userspace via kernel thread.
 */
static __always_inline u64 current_thread_amr(void)
{
	if (current->thread.regs)
		return current->thread.regs->amr;
	return default_amr;
}
-aneesh
Define the new system registers that POE introduces and context switch them.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h          |  4 ++++
 arch/arm64/include/asm/vncr_mapping.h      |  1 +
 arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 27 ++++++++++++++++++++++
 arch/arm64/kvm/sys_regs.c                  | 19 +++++++++++++--
 4 files changed, 49 insertions(+), 2 deletions(-)

diff --git arch/arm64/include/asm/kvm_host.h arch/arm64/include/asm/kvm_host.h
index a33f5996ca9f..5c9de7692201 100644
--- arch/arm64/include/asm/kvm_host.h
+++ arch/arm64/include/asm/kvm_host.h
@@ -446,6 +446,8 @@ enum vcpu_sysreg {
 	GCR_EL1,	/* Tag Control Register */
 	TFSRE0_EL1,	/* Tag Fault Status Register (EL0) */

+	POR_EL0,	/* Permission Overlay Register 0 (EL0) */
+
 	/* 32bit specific registers. */
 	DACR32_EL2,	/* Domain Access Control Register */
 	IFSR32_EL2,	/* Instruction Fault Status Register */
@@ -517,6 +519,8 @@ enum vcpu_sysreg {
 	VNCR(PIR_EL1),	/* Permission Indirection Register 1 (EL1) */
 	VNCR(PIRE0_EL1), /* Permission Indirection Register 0 (EL1) */

+	VNCR(POR_EL1),	/* Permission Overlay Register 1 (EL1) */
+
 	VNCR(HFGRTR_EL2),
 	VNCR(HFGWTR_EL2),
 	VNCR(HFGITR_EL2),

diff --git arch/arm64/include/asm/vncr_mapping.h arch/arm64/include/asm/vncr_mapping.h
index df2c47c55972..06f8ec0906a6 100644
--- arch/arm64/include/asm/vncr_mapping.h
+++ arch/arm64/include/asm/vncr_mapping.h
@@ -52,6 +52,7 @@
 #define VNCR_PIRE0_EL1          0x290
 #define VNCR_PIRE0_EL2          0x298
 #define VNCR_PIR_EL1            0x2A0
+#define VNCR_POR_EL1            0x2A8
 #define VNCR_ICH_LR0_EL2        0x400
 #define VNCR_ICH_LR1_EL2        0x408
 #define VNCR_ICH_LR2_EL2        0x410

diff --git arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index 4c0fdabaf8ae..1579a3c08a36 100644
--- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -16,9 +16,15 @@
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>

+static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt);
+
 static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
 {
 	ctxt_sys_reg(ctxt, MDSCR_EL1)	= read_sysreg(mdscr_el1);
+
+	// POR_EL0 can affect uaccess, so must be saved/restored early.
+	if (ctxt_has_s1poe(ctxt))
+		ctxt_sys_reg(ctxt, POR_EL0)	= read_sysreg_s(SYS_POR_EL0);
 }

 static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
@@ -66,6 +72,17 @@ static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt)
 	return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, TCRX, IMP);
 }

+static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt)
+{
+	struct kvm_vcpu *vcpu;
+
+	if (!system_supports_poe())
+		return false;
+
+	vcpu = ctxt_to_vcpu(ctxt);
+	return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1POE, IMP);
+}
+
 static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
 {
 	ctxt_sys_reg(ctxt, SCTLR_EL1)	= read_sysreg_el1(SYS_SCTLR);
@@ -80,6 +97,9 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
 			ctxt_sys_reg(ctxt, PIR_EL1)	= read_sysreg_el1(SYS_PIR);
 			ctxt_sys_reg(ctxt, PIRE0_EL1)	= read_sysreg_el1(SYS_PIRE0);
 		}
+
+		if (ctxt_has_s1poe(ctxt))
+			ctxt_sys_reg(ctxt, POR_EL1)	= read_sysreg_el1(SYS_POR);
 	}
 	ctxt_sys_reg(ctxt, ESR_EL1)	= read_sysreg_el1(SYS_ESR);
 	ctxt_sys_reg(ctxt, AFSR0_EL1)	= read_sysreg_el1(SYS_AFSR0);
@@ -120,6 +140,10 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
 static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
 {
 	write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1),  mdscr_el1);
+
+	// POR_EL0 can affect uaccess, so must be saved/restored early.
+	if (ctxt_has_s1poe(ctxt))
+		write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0),	SYS_POR_EL0);
 }

 static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
@@ -158,6 +182,9 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
 			write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1),	SYS_PIR);
 			write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1),	SYS_PIRE0);
 		}
+
+		if (ctxt_has_s1poe(ctxt))
+			write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1),	SYS_POR);
 	}
 	write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1),	SYS_ESR);
 	write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1),	SYS_AFSR0);

diff --git arch/arm64/kvm/sys_regs.c arch/arm64/kvm/sys_regs.c
index c90324060436..e7208b59ea12 100644
--- arch/arm64/kvm/sys_regs.c
+++ arch/arm64/kvm/sys_regs.c
@@ -2255,6 +2255,15 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu,
 	return true;
 }

+static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu,
+				     const struct sys_reg_desc *rd)
+{
+	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1POE, IMP))
+		return 0;
+
+	return REG_HIDDEN;
+}
+
 /*
  * Architected system registers.
  * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
@@ -2492,6 +2501,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	{ SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 },
 	{ SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1 },
 	{ SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1 },
+	{ SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1,
+	  .visibility = s1poe_visibility },
 	{ SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 },

 	{ SYS_DESC(SYS_LORSA_EL1), trap_loregion },
@@ -2578,6 +2589,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	  .access = access_pmovs, .reg = PMOVSSET_EL0,
 	  .get_user = get_pmreg, .set_user = set_pmreg },

+	{ SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0,
+	  .visibility = s1poe_visibility },
 	{ SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 },
 	{ SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 },
 	{ SYS_DESC(SYS_TPIDR2_EL0), undef_access },
@@ -4568,8 +4581,6 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu)
 		kvm->arch.fgu[HFGxTR_GROUP] = (HFGxTR_EL2_nAMAIR2_EL1	|
 					       HFGxTR_EL2_nMAIR2_EL1	|
 					       HFGxTR_EL2_nS2POR_EL1	|
-					       HFGxTR_EL2_nPOR_EL1	|
-					       HFGxTR_EL2_nPOR_EL0	|
 					       HFGxTR_EL2_nACCDATA_EL1	|
 					       HFGxTR_EL2_nSMPRI_EL1_MASK |
 					       HFGxTR_EL2_nTPIDR2_EL0_MASK);
@@ -4604,6 +4615,10 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu)
 		kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 |
 						HFGxTR_EL2_nPIR_EL1);

+	if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP))
+		kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 |
+						HFGxTR_EL2_nPOR_EL0);
+
 	if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP))
 		kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 |
 						  HAFGRTR_EL2_RES1);
To allow using newer instructions that current assemblers don't know about, replace the `at` instruction with the underlying SYS instruction.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_asm.h       | 3 ++-
 arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git arch/arm64/include/asm/kvm_asm.h arch/arm64/include/asm/kvm_asm.h
index 2181a11b9d92..38d7bfa3966a 100644
--- arch/arm64/include/asm/kvm_asm.h
+++ arch/arm64/include/asm/kvm_asm.h
@@ -9,6 +9,7 @@

 #include <asm/hyp_image.h>
 #include <asm/insn.h>
+#include <asm/sysreg.h>
 #include <asm/virt.h>

 #define ARM_EXIT_WITH_SERROR_BIT  31
@@ -259,7 +260,7 @@ extern u64 __kvm_get_mdcr_el2(void);
	asm volatile(							\
	"	mrs	%1, spsr_el2\n"					\
	"	mrs	%2, elr_el2\n"					\
-	"1:	at	"at_op", %3\n"					\
+	"1:	" __msr_s(at_op, "%3") "\n"				\
	"	isb\n"							\
	"	b	9f\n"						\
	"2:	msr	spsr_el2, %1\n"					\

diff --git arch/arm64/kvm/hyp/include/hyp/fault.h arch/arm64/kvm/hyp/include/hyp/fault.h
index 9e13c1bc2ad5..487c06099d6f 100644
--- arch/arm64/kvm/hyp/include/hyp/fault.h
+++ arch/arm64/kvm/hyp/include/hyp/fault.h
@@ -27,7 +27,7 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
	 * saved the guest context yet, and we may return early...
	 */
	par = read_sysreg_par();
-	if (!__kvm_at("s1e1r", far))
+	if (!__kvm_at(OP_AT_S1E1R, far))
		tmp = read_sysreg_par();
	else
		tmp = SYS_PAR_EL1_F; /* back to the guest */
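A quick aside on what this expands to, in case it helps review (illustrative only, and from my reading of the Arm ARM rather than anything in this patch): AT is an alias of the generic SYS instruction, so

	at	s1e1r, x0

is the same encoding as

	sys	#0, c7, c8, #0, x0

and __msr_s() emits that underlying form directly. That is why a not-yet-known operation like OP_AT_S1E1A can later be passed to __kvm_at() even with assemblers that have never heard of FEAT_ATS1E1A.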
On Thu, Aug 22, 2024 at 04:10:51PM +0100, Joey Gouly wrote:
To allow using newer instructions that current assemblers don't know about, replace the `at` instruction with the underlying SYS instruction.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>

 arch/arm64/include/asm/kvm_asm.h       | 3 ++-
 arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)
Marc -- what would you like to do with this patch? I think the POE series is really close now, so ideally I'd queue the lot on a branch in arm64 and you could pull the first ~10 patches into kvmarm if you need 'em.
Would that work for you, or did you have something else in mind (since this one is also included in your series adding nv support for AT)?
Cheers,
Will
On Fri, 23 Aug 2024 14:48:11 +0100, Will Deacon will@kernel.org wrote:
On Thu, Aug 22, 2024 at 04:10:51PM +0100, Joey Gouly wrote:
To allow using newer instructions that current assemblers don't know about, replace the `at` instruction with the underlying SYS instruction.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>

 arch/arm64/include/asm/kvm_asm.h       | 3 ++-
 arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)
Marc -- what would you like to do with this patch? I think the POE series is really close now, so ideally I'd queue the lot on a branch in arm64 and you could pull the first ~10 patches into kvmarm if you need 'em.
Would that work for you, or did you have something else in mind (since this one is also included in your series adding nv support for AT)?
Yup, that works for me. I can take that prefix as the base for the AT series and drop my copy of this patch.
Thanks,
M.
Hi Will,
On Fri, 23 Aug 2024 14:48:11 +0100, Will Deacon will@kernel.org wrote:
On Thu, Aug 22, 2024 at 04:10:51PM +0100, Joey Gouly wrote:
To allow using newer instructions that current assemblers don't know about, replace the `at` instruction with the underlying SYS instruction.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>

 arch/arm64/include/asm/kvm_asm.h       | 3 ++-
 arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)
Marc -- what would you like to do with this patch? I think the POE series is really close now, so ideally I'd queue the lot on a branch in arm64 and you could pull the first ~10 patches into kvmarm if you need 'em.
Would that work for you, or did you have something else in mind (since this one is also included in your series adding nv support for AT)?
Is there any progress on this front? I am quite eager to queue the AT series, but the dependency on this patch is preventing me from doing so.
I can see there are outstanding questions on the POE series, so I was wondering if we should consider reversing the dependency: I can create a stable branch with this single patch, which you can pull as a prefix of the POE series.
Please let me know what you prefer.
Thanks,
M.
Hey Marc,
On Fri, Aug 30, 2024 at 09:01:18AM +0100, Marc Zyngier wrote:
On Fri, 23 Aug 2024 14:48:11 +0100, Will Deacon will@kernel.org wrote:
On Thu, Aug 22, 2024 at 04:10:51PM +0100, Joey Gouly wrote:
To allow using newer instructions that current assemblers don't know about, replace the `at` instruction with the underlying SYS instruction.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>

 arch/arm64/include/asm/kvm_asm.h       | 3 ++-
 arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)
Marc -- what would you like to do with this patch? I think the POE series is really close now, so ideally I'd queue the lot on a branch in arm64 and you could pull the first ~10 patches into kvmarm if you need 'em.
Would that work for you, or did you have something else in mind (since this one is also included in your series adding nv support for AT)?
Is there any progress on this front? I am quite eager to queue the AT series, but the dependency on this patch is preventing me from doing so.
I can see there are outstanding questions on the POE series, so I was wondering if we should consider reversing the dependency: I can create a stable branch with this single patch, which you can pull as a prefix of the POE series.
That sounds like a good idea. The uaccess discussion seems to have stalled and I don't really want to merge the series without concluding that.
So please go ahead with this single patch and I'll pull it in if things start moving again.
Will
On Fri, 30 Aug 2024 10:05:22 +0100, Will Deacon will@kernel.org wrote:
Hey Marc,
On Fri, Aug 30, 2024 at 09:01:18AM +0100, Marc Zyngier wrote:
On Fri, 23 Aug 2024 14:48:11 +0100, Will Deacon will@kernel.org wrote:
On Thu, Aug 22, 2024 at 04:10:51PM +0100, Joey Gouly wrote:
To allow using newer instructions that current assemblers don't know about, replace the `at` instruction with the underlying SYS instruction.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>

 arch/arm64/include/asm/kvm_asm.h       | 3 ++-
 arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)
Marc -- what would you like to do with this patch? I think the POE series is really close now, so ideally I'd queue the lot on a branch in arm64 and you could pull the first ~10 patches into kvmarm if you need 'em.
Would that work for you, or did you have something else in mind (since this one is also included in your series adding nv support for AT)?
Is there any progress on this front? I am quite eager to queue the AT series, but the dependency on this patch is preventing me from doing so.
I can see there are outstanding questions on the POE series, so I was wondering if we should consider reversing the dependency: I can create a stable branch with this single patch, which you can pull as a prefix of the POE series.
That sounds like a good idea. The uaccess discussion seems to have stalled and I don't really want to merge the series without concluding that.
So please go ahead with this single patch and I'll pull it in if things start moving again.
Now pushed to [1].
Thanks,
M.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git/log/?h=arm...
On Thu, Aug 22, 2024 at 04:10:51PM +0100, Joey Gouly wrote:
To allow using newer instructions that current assemblers don't know about, replace the `at` instruction with the underlying SYS instruction.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>

 arch/arm64/include/asm/kvm_asm.h       | 3 ++-
 arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)
Acked-by: Will Deacon will@kernel.org
diff --git arch/arm64/include/asm/kvm_asm.h arch/arm64/include/asm/kvm_asm.h
index 2181a11b9d92..38d7bfa3966a 100644
--- arch/arm64/include/asm/kvm_asm.h
+++ arch/arm64/include/asm/kvm_asm.h
FWIW (mainly for Marc): you seem to be missing the 'a/' and 'b/' prefixes here, so my git wouldn't accept the change when I tried to apply it locally for testing.
Will
On Fri, 30 Aug 2024 10:25:27 +0100, Will Deacon will@kernel.org wrote:
On Thu, Aug 22, 2024 at 04:10:51PM +0100, Joey Gouly wrote:
To allow using newer instructions that current assemblers don't know about, replace the `at` instruction with the underlying SYS instruction.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>

 arch/arm64/include/asm/kvm_asm.h       | 3 ++-
 arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)
Acked-by: Will Deacon will@kernel.org
diff --git arch/arm64/include/asm/kvm_asm.h arch/arm64/include/asm/kvm_asm.h
index 2181a11b9d92..38d7bfa3966a 100644
--- arch/arm64/include/asm/kvm_asm.h
+++ arch/arm64/include/asm/kvm_asm.h
FWIW (mainly for Marc): you seem to be missing the 'a/' and 'b/' prefixes here, so my git wouldn't accept the change when I tried to apply it locally for testing.
Seems like a spurious '--no-prefix' was added at patch formatting time. That clashes with git-apply's default '-p1', which strips the first component of the path.
There's probably a way to pass '-p0' to 'git am', but I don't feel like trawling the git documentation in this heat...
M.
On Fri, Aug 30, 2024 at 12:23:33PM +0100, Marc Zyngier wrote:
On Fri, 30 Aug 2024 10:25:27 +0100, Will Deacon will@kernel.org wrote:
On Thu, Aug 22, 2024 at 04:10:51PM +0100, Joey Gouly wrote:
To allow using newer instructions that current assemblers don't know about, replace the `at` instruction with the underlying SYS instruction.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>

 arch/arm64/include/asm/kvm_asm.h       | 3 ++-
 arch/arm64/kvm/hyp/include/hyp/fault.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)
Acked-by: Will Deacon will@kernel.org
diff --git arch/arm64/include/asm/kvm_asm.h arch/arm64/include/asm/kvm_asm.h
index 2181a11b9d92..38d7bfa3966a 100644
--- arch/arm64/include/asm/kvm_asm.h
+++ arch/arm64/include/asm/kvm_asm.h
FWIW (mainly for Marc): you seem to be missing the 'a/' and 'b/' prefixes here, so my git wouldn't accept the change when I tried to apply it locally for testing.
Seems like a spurious '--no-prefix' was added at patch formatting time. That clashes with git-apply's default '-p1', which strips the first component of the path.
I had --no-prefix in my .git/config for diffs, but I didn't realise that it also applied to git format-patch, sorry about that. I have removed it now.
If you want me to resend v5, or something else, let me know.
There's probably a way to pass '-p0' to 'git am', but I don't feel like trawling the git documentation in this heat...
M.
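(For what it's worth, I believe `git am` does accept `-p<n>` and passes it through to git apply, so `git am -p0` should cope with a --no-prefix patch; I haven't double-checked though.)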
Related to uaccess: Catalin is away, not sure when he's back, so I'm hoping we can resolve that when he's around.
Thanks, Joey
FEAT_ATS1E1A introduces a new instruction: `at s1e1a`. This is an address translation, without permission checks.
POE allows read permissions to be removed from S1 by the guest. This means that an `at` instruction could fail, and not get the IPA.
Switch to using `at s1e1a` so that KVM can get the IPA regardless of S1 permissions.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/hyp/include/hyp/fault.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git arch/arm64/kvm/hyp/include/hyp/fault.h arch/arm64/kvm/hyp/include/hyp/fault.h
index 487c06099d6f..17df94570f03 100644
--- arch/arm64/kvm/hyp/include/hyp/fault.h
+++ arch/arm64/kvm/hyp/include/hyp/fault.h
@@ -14,6 +14,7 @@

 static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
 {
+	int ret;
	u64 par, tmp;

	/*
@@ -27,7 +28,9 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
	 * saved the guest context yet, and we may return early...
	 */
	par = read_sysreg_par();
-	if (!__kvm_at(OP_AT_S1E1R, far))
+	ret = system_supports_poe() ? __kvm_at(OP_AT_S1E1A, far) :
+				      __kvm_at(OP_AT_S1E1R, far);
+	if (!ret)
		tmp = read_sysreg_par();
	else
		tmp = SYS_PAR_EL1_F; /* back to the guest */
Add the missing sanitisation of ID_AA64MMFR3_EL1, making sure we solely expose S1POE and TCRX (we currently don't support anything else).
[joey: Took Marc's patch for S1PIE, and changed it for S1POE]
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
---
 arch/arm64/kvm/sys_regs.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git arch/arm64/kvm/sys_regs.c arch/arm64/kvm/sys_regs.c
index e7208b59ea12..0f13378e761c 100644
--- arch/arm64/kvm/sys_regs.c
+++ arch/arm64/kvm/sys_regs.c
@@ -1556,6 +1556,9 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
	case SYS_ID_AA64MMFR2_EL1:
		val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
		break;
+	case SYS_ID_AA64MMFR3_EL1:
+		val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE;
+		break;
	case SYS_ID_MMFR4_EL1:
		val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX);
		break;
@@ -2427,7 +2430,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
					ID_AA64MMFR2_EL1_IDS |
					ID_AA64MMFR2_EL1_NV |
					ID_AA64MMFR2_EL1_CCIDX)),
-	ID_SANITISED(ID_AA64MMFR3_EL1),
+	ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX |
+				       ID_AA64MMFR3_EL1_S1POE)),
	ID_SANITISED(ID_AA64MMFR4_EL1),
	ID_UNALLOCATED(7,5),
	ID_UNALLOCATED(7,6),
Expose a HWCAP and ID_AA64MMFR3_EL1_S1POE to userspace, so they can be used to check if the CPU supports the feature.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 Documentation/arch/arm64/elf_hwcaps.rst |  2 ++
 arch/arm64/include/asm/hwcap.h          |  1 +
 arch/arm64/include/uapi/asm/hwcap.h     |  1 +
 arch/arm64/kernel/cpufeature.c          | 14 ++++++++++++++
 arch/arm64/kernel/cpuinfo.c             |  1 +
 5 files changed, 19 insertions(+)

diff --git Documentation/arch/arm64/elf_hwcaps.rst Documentation/arch/arm64/elf_hwcaps.rst
index 448c1664879b..694f67fa07d1 100644
--- Documentation/arch/arm64/elf_hwcaps.rst
+++ Documentation/arch/arm64/elf_hwcaps.rst
@@ -365,6 +365,8 @@ HWCAP2_SME_SF8DP2
 HWCAP2_SME_SF8DP4
     Functionality implied by ID_AA64SMFR0_EL1.SF8DP4 == 0b1.

+HWCAP2_POE
+    Functionality implied by ID_AA64MMFR3_EL1.S1POE == 0b0001.
+
 4. Unused AT_HWCAP bits
 -----------------------

diff --git arch/arm64/include/asm/hwcap.h arch/arm64/include/asm/hwcap.h
index 4edd3b61df11..a775adddecf2 100644
--- arch/arm64/include/asm/hwcap.h
+++ arch/arm64/include/asm/hwcap.h
@@ -157,6 +157,7 @@
 #define KERNEL_HWCAP_SME_SF8FMA		__khwcap2_feature(SME_SF8FMA)
 #define KERNEL_HWCAP_SME_SF8DP4		__khwcap2_feature(SME_SF8DP4)
 #define KERNEL_HWCAP_SME_SF8DP2		__khwcap2_feature(SME_SF8DP2)
+#define KERNEL_HWCAP_POE		__khwcap2_feature(POE)

 /*
  * This yields a mask that user programs can use to figure out what

diff --git arch/arm64/include/uapi/asm/hwcap.h arch/arm64/include/uapi/asm/hwcap.h
index 285610e626f5..055381b2c615 100644
--- arch/arm64/include/uapi/asm/hwcap.h
+++ arch/arm64/include/uapi/asm/hwcap.h
@@ -122,5 +122,6 @@
 #define HWCAP2_SME_SF8FMA	(1UL << 60)
 #define HWCAP2_SME_SF8DP4	(1UL << 61)
 #define HWCAP2_SME_SF8DP2	(1UL << 62)
+#define HWCAP2_POE		(1UL << 63)

 #endif /* _UAPI__ASM_HWCAP_H */

diff --git arch/arm64/kernel/cpufeature.c arch/arm64/kernel/cpufeature.c
index 2daf5597cd65..718728a85430 100644
--- arch/arm64/kernel/cpufeature.c
+++ arch/arm64/kernel/cpufeature.c
@@ -466,6 +466,8 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = {
 };

 static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = {
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE),
+		       FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0),
	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0),
	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0),
	ARM64_FTR_END,
@@ -2348,6 +2350,14 @@ static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused)
	sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn);
 }

+#ifdef CONFIG_ARM64_POE
+static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused)
+{
+	sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1x_E0POE);
+	sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_E0POE);
+}
+#endif
+
 /* Internal helper functions to match cpu capability type */
 static bool
 cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap)
@@ -2876,6 +2886,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
		.capability = ARM64_HAS_S1POE,
		.type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
		.matches = has_cpuid_feature,
+		.cpu_enable = cpu_enable_poe,
		ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP)
	},
 #endif
@@ -3043,6 +3054,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
	HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2),
	HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3),
	HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2),
+#ifdef CONFIG_ARM64_POE
+	HWCAP_CAP(ID_AA64MMFR3_EL1, S1POE, IMP, CAP_HWCAP, KERNEL_HWCAP_POE),
+#endif
	{},
 };

diff --git arch/arm64/kernel/cpuinfo.c arch/arm64/kernel/cpuinfo.c
index 09eeaa24d456..b9db812082b3 100644
--- arch/arm64/kernel/cpuinfo.c
+++ arch/arm64/kernel/cpuinfo.c
@@ -143,6 +143,7 @@ static const char *const hwcap_str[] = {
	[KERNEL_HWCAP_SME_SF8FMA]	= "smesf8fma",
	[KERNEL_HWCAP_SME_SF8DP4]	= "smesf8dp4",
	[KERNEL_HWCAP_SME_SF8DP2]	= "smesf8dp2",
+	[KERNEL_HWCAP_POE]		= "poe",
 };

 #ifdef CONFIG_COMPAT
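For anyone wanting to poke at this from userspace, a minimal check could look like the following (illustrative only, using getauxval() and the HWCAP2_POE bit defined above):

	#include <stdio.h>
	#include <sys/auxv.h>

	#ifndef HWCAP2_POE
	#define HWCAP2_POE	(1UL << 63)
	#endif

	int main(void)
	{
		unsigned long hwcap2 = getauxval(AT_HWCAP2);

		/* matches the "poe" entry in /proc/cpuinfo */
		printf("FEAT_S1POE: %s\n",
		       (hwcap2 & HWCAP2_POE) ? "present" : "absent");
		return 0;
	}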
VM_PKEY_BIT[012] will use VM_HIGH_ARCH_[012], so move the MTE VM flags to accommodate this.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git include/linux/mm.h include/linux/mm.h
index 56dc2481cc0f..39e6234e2365 100644
--- include/linux/mm.h
+++ include/linux/mm.h
@@ -378,8 +378,8 @@ extern unsigned int kobjsize(const void *objp);
 #endif

 #if defined(CONFIG_ARM64_MTE)
-# define VM_MTE		VM_HIGH_ARCH_0	/* Use Tagged memory for access control */
-# define VM_MTE_ALLOWED	VM_HIGH_ARCH_1	/* Tagged memory permitted */
+# define VM_MTE		VM_HIGH_ARCH_4	/* Use Tagged memory for access control */
+# define VM_MTE_ALLOWED	VM_HIGH_ARCH_5	/* Tagged memory permitted */
 #else
 # define VM_MTE		VM_NONE
 # define VM_MTE_ALLOWED	VM_NONE
The 3-bit POIndex is stored in the PTE at bits 60..62.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/pgtable-hwdef.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git arch/arm64/include/asm/pgtable-hwdef.h arch/arm64/include/asm/pgtable-hwdef.h
index 1f60aa1bc750..3f0c3f5c5cef 100644
--- arch/arm64/include/asm/pgtable-hwdef.h
+++ arch/arm64/include/asm/pgtable-hwdef.h
@@ -199,6 +199,16 @@
 #define PTE_PI_IDX_2	53	/* PXN */
 #define PTE_PI_IDX_3	54	/* UXN */

+/*
+ * POIndex[2:0] encoding (Permission Overlay Extension)
+ */
+#define PTE_PO_IDX_0	(_AT(pteval_t, 1) << 60)
+#define PTE_PO_IDX_1	(_AT(pteval_t, 1) << 61)
+#define PTE_PO_IDX_2	(_AT(pteval_t, 1) << 62)
+
+#define PTE_PO_IDX_MASK	GENMASK_ULL(62, 60)
+
 /*
  * Memory Attribute override for Stage-2 (MemAttr[3:0])
  */
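As a worked example: pkey 5 is 0b101, so a page with that key has PTE_PO_IDX_2 and PTE_PO_IDX_0 set (bits 62 and 60) and PTE_PO_IDX_1 clear.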
Modify arch_calc_vm_prot_bits() and vm_get_page_prot() such that the pkey value is set in the vm_flags and then into the pgprot value.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/mman.h | 10 +++++++++-
 arch/arm64/mm/mmap.c          | 11 +++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git arch/arm64/include/asm/mman.h arch/arm64/include/asm/mman.h
index 5966ee4a6154..52791715f6e6 100644
--- arch/arm64/include/asm/mman.h
+++ arch/arm64/include/asm/mman.h
@@ -7,7 +7,7 @@
 #include <uapi/asm/mman.h>

 static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
-	unsigned long pkey __always_unused)
+	unsigned long pkey)
 {
	unsigned long ret = 0;

@@ -17,6 +17,14 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
	if (system_supports_mte() && (prot & PROT_MTE))
		ret |= VM_MTE;

+#ifdef CONFIG_ARCH_HAS_PKEYS
+	if (system_supports_poe()) {
+		ret |= pkey & BIT(0) ? VM_PKEY_BIT0 : 0;
+		ret |= pkey & BIT(1) ? VM_PKEY_BIT1 : 0;
+		ret |= pkey & BIT(2) ? VM_PKEY_BIT2 : 0;
+	}
+#endif
+
	return ret;
 }
 #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)

diff --git arch/arm64/mm/mmap.c arch/arm64/mm/mmap.c
index 642bdf908b22..7e3ad97e27d8 100644
--- arch/arm64/mm/mmap.c
+++ arch/arm64/mm/mmap.c
@@ -102,6 +102,17 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
	if (vm_flags & VM_MTE)
		prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED);

+#ifdef CONFIG_ARCH_HAS_PKEYS
+	if (system_supports_poe()) {
+		if (vm_flags & VM_PKEY_BIT0)
+			prot |= PTE_PO_IDX_0;
+		if (vm_flags & VM_PKEY_BIT1)
+			prot |= PTE_PO_IDX_1;
+		if (vm_flags & VM_PKEY_BIT2)
+			prot |= PTE_PO_IDX_2;
+	}
+#endif
+
	return __pgprot(prot);
 }
 EXPORT_SYMBOL(vm_get_page_prot);
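To trace a value through (hypothetical): pkey_mprotect(addr, len, PROT_READ, 3) puts VM_PKEY_BIT0 | VM_PKEY_BIT1 into vm_flags via arch_calc_vm_prot_bits(), and vm_get_page_prot() then folds those into PTE_PO_IDX_0 | PTE_PO_IDX_1, so the hardware sees POIndex 3 on every PTE in the range.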
When a PTE is modified, the POIndex must be masked off so that it can be set from the new pgprot value.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/include/asm/pgtable.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git arch/arm64/include/asm/pgtable.h arch/arm64/include/asm/pgtable.h
index 7a4f5604be3f..1d0f18d30e1e 100644
--- arch/arm64/include/asm/pgtable.h
+++ arch/arm64/include/asm/pgtable.h
@@ -1103,7 +1103,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
	 */
	const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY |
			      PTE_PRESENT_INVALID | PTE_VALID | PTE_WRITE |
-			      PTE_GP | PTE_ATTRINDX_MASK;
+			      PTE_GP | PTE_ATTRINDX_MASK | PTE_PO_IDX_MASK;
+
	/* preserve the hardware dirty information */
	if (pte_hw_dirty(pte))
		pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
If a memory fault occurs that is due to an overlay/pkey fault, report that to userspace with a SEGV_PKUERR.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/traps.h |  1 +
 arch/arm64/kernel/traps.c      |  6 ++++
 arch/arm64/mm/fault.c          | 55 +++++++++++++++++++++++++++++++++-
 3 files changed, 61 insertions(+), 1 deletion(-)

diff --git arch/arm64/include/asm/traps.h arch/arm64/include/asm/traps.h
index eefe766d6161..d780d1bd2eac 100644
--- arch/arm64/include/asm/traps.h
+++ arch/arm64/include/asm/traps.h
@@ -25,6 +25,7 @@ try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn)
 void force_signal_inject(int signal, int code, unsigned long address, unsigned long err);
 void arm64_notify_segfault(unsigned long addr);
 void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str);
+void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey);
 void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str);
 void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str);

diff --git arch/arm64/kernel/traps.c arch/arm64/kernel/traps.c
index 9e22683aa921..9a11bb0db284 100644
--- arch/arm64/kernel/traps.c
+++ arch/arm64/kernel/traps.c
@@ -273,6 +273,12 @@ void arm64_force_sig_fault(int signo, int code, unsigned long far,
		force_sig_fault(signo, code, (void __user *)far);
 }

+void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey)
+{
+	arm64_show_signal(SIGSEGV, str);
+	force_sig_pkuerr((void __user *)far, pkey);
+}
+
 void arm64_force_sig_mceerr(int code, unsigned long far, short lsb,
			    const char *str)
 {

diff --git arch/arm64/mm/fault.c arch/arm64/mm/fault.c
index 451ba7cbd5ad..a68055150950 100644
--- arch/arm64/mm/fault.c
+++ arch/arm64/mm/fault.c
@@ -23,6 +23,7 @@
 #include <linux/sched/debug.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
+#include <linux/pkeys.h>
 #include <linux/preempt.h>
 #include <linux/hugetlb.h>

@@ -486,6 +487,23 @@ static void do_bad_area(unsigned long far, unsigned long esr,
	}
 }

+static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
+			unsigned int mm_flags)
+{
+	unsigned long iss2 = ESR_ELx_ISS2(esr);
+
+	if (!system_supports_poe())
+		return false;
+
+	if (iss2 & ESR_ELx_Overlay)
+		return true;
+
+	return !arch_vma_access_permitted(vma,
+			mm_flags & FAULT_FLAG_WRITE,
+			mm_flags & FAULT_FLAG_INSTRUCTION,
+			false);
+}
+
 static bool is_el0_instruction_abort(unsigned long esr)
 {
	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
@@ -511,6 +529,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
	unsigned long addr = untagged_addr(far);
	struct vm_area_struct *vma;
	int si_code;
+	int pkey = -1;

	if (kprobe_page_fault(regs, esr))
		return 0;
@@ -575,6 +594,16 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto bad_area;
	}
+
+	if (fault_from_pkey(esr, vma, mm_flags)) {
+		pkey = vma_pkey(vma);
+		vma_end_read(vma);
+		fault = 0;
+		si_code = SEGV_PKUERR;
+		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+		goto bad_area;
+	}
+
	fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);
@@ -610,7 +639,16 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
		goto bad_area;
	}

+	if (fault_from_pkey(esr, vma, mm_flags)) {
+		pkey = vma_pkey(vma);
+		mmap_read_unlock(mm);
+		fault = 0;
+		si_code = SEGV_PKUERR;
+		goto bad_area;
+	}
+
	fault = handle_mm_fault(vma, addr, mm_flags, regs);
+
	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
@@ -669,8 +707,23 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,

		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
	} else {
+		/*
+		 * The pkey value that we return to userspace can be different
+		 * from the pkey that caused the fault.
+		 *
+		 * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
+		 * 2. T1 : set POR_EL0 to deny access to pkey=4, touches, page
+		 * 3. T1 : faults...
+		 * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+		 * 5. T1 : enters fault handler, takes mmap_lock, etc...
+		 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+		 *	   faulted on a pte with its pkey=4.
+		 */
		/* Something tried to access memory that out of memory map */
-		arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name);
+		if (si_code == SEGV_PKUERR)
+			arm64_force_sig_fault_pkey(far, inf->name, pkey);
+		else
+			arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name);
	}

	return 0;
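On the userspace side the pkey is then available in siginfo; a minimal (illustrative) SA_SIGINFO handler, needing <signal.h>, <stdio.h> and <unistd.h>, might look like:

	static void segv_handler(int sig, siginfo_t *si, void *uc)
	{
		/* si_pkey is only valid when si_code == SEGV_PKUERR */
		if (si->si_code == SEGV_PKUERR)
			fprintf(stderr, "pkey fault at %p, pkey %d\n",
				si->si_addr, si->si_pkey);
		_exit(1);
	}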
On Thu, Aug 22, 2024 at 04:10:59PM +0100, Joey Gouly wrote:
+static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
+			unsigned int mm_flags)
+{
+	unsigned long iss2 = ESR_ELx_ISS2(esr);
+
+	if (!system_supports_poe())
+		return false;
+
+	if (iss2 & ESR_ELx_Overlay)
+		return true;
Does this need an is_data_abort() && is_instruction_abort() check? Overlay doesn't appear to be defined for all exception types and it wasn't clear enough to me that the callers have done this check.
On Thu, Aug 29, 2024 at 06:55:07PM +0100, Mark Brown wrote:
On Thu, Aug 22, 2024 at 04:10:59PM +0100, Joey Gouly wrote:
+static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
+			unsigned int mm_flags)
+{
+	unsigned long iss2 = ESR_ELx_ISS2(esr);
+
+	if (!system_supports_poe())
+		return false;
+
+	if (iss2 & ESR_ELx_Overlay)
+		return true;
Does this need an is_data_abort() && is_instruction_abort() check? Overlay doesn't appear to be defined for all exception types and it wasn't clear enough to me that the callers have done this check.
The only callers are in do_page_fault(), which should only be data or instruction aborts. I talked with Catalin and he said it's fine to not check again here.
I can add a permissions check though:
commit 033270f5a9462e998b4dee11fc91b43ac7929756
Author: Joey Gouly <joey.gouly@arm.com>
Date:   Tue Sep 3 15:45:59 2024 +0100

    fixup! arm64: handle PKEY/POE faults

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index a68055150950..f651553a8ab8 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -495,6 +495,9 @@ static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
	if (!system_supports_poe())
		return false;

+	if (!esr_fsc_is_permission_fault(esr))
+		return false;
+
	if (iss2 & ESR_ELx_Overlay)
		return true;

Since the ESR_EL1 documentation says: "If a memory access generates a Data Abort for a Permission fault, then this field holds information about the fault."
Thanks, Joey
On Tue, Sep 03, 2024 at 03:50:46PM +0100, Joey Gouly wrote:
On Thu, Aug 29, 2024 at 06:55:07PM +0100, Mark Brown wrote:
On Thu, Aug 22, 2024 at 04:10:59PM +0100, Joey Gouly wrote:
+static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
+			unsigned int mm_flags)
+{
+	unsigned long iss2 = ESR_ELx_ISS2(esr);
+
+	if (!system_supports_poe())
+		return false;
+
+	if (iss2 & ESR_ELx_Overlay)
+		return true;
Does this need an is_data_abort() && is_instruction_abort() check? Overlay doesn't appear to be defined for all exception types and it wasn't clear enough to me that the callers have done this check.
The only callers are in do_page_fault(), which should only be data or instruction aborts. I talked with Catalin and he said it's fine to not check again here.
I can add a permissions check though:
commit 033270f5a9462e998b4dee11fc91b43ac7929756
Author: Joey Gouly <joey.gouly@arm.com>
Date:   Tue Sep 3 15:45:59 2024 +0100

    fixup! arm64: handle PKEY/POE faults

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index a68055150950..f651553a8ab8 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -495,6 +495,9 @@ static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
	if (!system_supports_poe())
		return false;

+	if (!esr_fsc_is_permission_fault(esr))
+		return false;
+
	if (iss2 & ESR_ELx_Overlay)
		return true;

Since the ESR_EL1 documentation says: "If a memory access generates a Data Abort for a Permission fault, then this field holds information about the fault."
Sorry, I was a bit too eager with that patch. The previous patch was bailing out before the VMA-based checks could take place.
It should be:
commit 7b67b149f2f492e907b27521c95639f4ea208221 (HEAD -> permission_overlay_v6)
Author: Joey Gouly <joey.gouly@arm.com>
Date:   Tue Sep 3 15:45:59 2024 +0100

    fixup! arm64: handle PKEY/POE faults

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index a68055150950..8b281cf308b3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -495,7 +495,7 @@ static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
	if (!system_supports_poe())
		return false;

-	if (iss2 & ESR_ELx_Overlay)
+	if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay))
		return true;

	return !arch_vma_access_permitted(vma,
We do not want to take POE into account when clearing the MTE tags.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/include/asm/pgtable.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git arch/arm64/include/asm/pgtable.h arch/arm64/include/asm/pgtable.h
index 1d0f18d30e1e..61a674942a6b 100644
--- arch/arm64/include/asm/pgtable.h
+++ arch/arm64/include/asm/pgtable.h
@@ -156,8 +156,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
 * not set) must return false. PROT_NONE mappings do not have the
 * PTE_VALID bit set.
 */
-#define pte_access_permitted(pte, write) \
+#define pte_access_permitted_no_overlay(pte, write) \
	(((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte)))
+#define pte_access_permitted(pte, write) \
+	pte_access_permitted_no_overlay(pte, write)
 #define pmd_access_permitted(pmd, write) \
	(pte_access_permitted(pmd_pte(pmd), (write)))
 #define pud_access_permitted(pud, write) \
@@ -373,10 +375,11 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages)
	/*
	 * If the PTE would provide user space access to the tags associated
	 * with it then ensure that the MTE tags are synchronised. Although
-	 * pte_access_permitted() returns false for exec only mappings, they
-	 * don't expose tags (instruction fetches don't check tags).
+	 * pte_access_permitted_no_overlay() returns false for exec only
+	 * mappings, they don't expose tags (instruction fetches don't check
+	 * tags).
	 */
-	if (system_supports_mte() && pte_access_permitted(pte, false) &&
+	if (system_supports_mte() && pte_access_permitted_no_overlay(pte, false) &&
	    !pte_special(pte) && pte_tagged(pte))
		mte_sync_tags(pte, nr_pages);
 }
Implement the PKEYS interface, using the Permission Overlay Extension.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/mmu.h         |   1 +
 arch/arm64/include/asm/mmu_context.h |  46 +++++++++++-
 arch/arm64/include/asm/pgtable.h     |  22 +++++-
 arch/arm64/include/asm/pkeys.h       | 108 +++++++++++++++++++++++++++
 arch/arm64/include/asm/por.h         |  33 ++++++++
 arch/arm64/include/uapi/asm/mman.h   |   9 +++
 arch/arm64/mm/mmu.c                  |  45 +++++++++++
 7 files changed, 262 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm64/include/asm/pkeys.h
 create mode 100644 arch/arm64/include/asm/por.h

diff --git arch/arm64/include/asm/mmu.h arch/arm64/include/asm/mmu.h
index 65977c7783c5..983afeb4eba5 100644
--- arch/arm64/include/asm/mmu.h
+++ arch/arm64/include/asm/mmu.h
@@ -25,6 +25,7 @@ typedef struct {
	refcount_t	pinned;
	void		*vdso;
	unsigned long	flags;
+	u8		pkey_allocation_map;
 } mm_context_t;

 /*

diff --git arch/arm64/include/asm/mmu_context.h arch/arm64/include/asm/mmu_context.h
index bd19f4c758b7..7c09d47e09cb 100644
--- arch/arm64/include/asm/mmu_context.h
+++ arch/arm64/include/asm/mmu_context.h
@@ -15,12 +15,12 @@
 #include <linux/sched/hotplug.h>
 #include <linux/mm_types.h>
 #include <linux/pgtable.h>
+#include <linux/pkeys.h>

 #include <asm/cacheflush.h>
 #include <asm/cpufeature.h>
 #include <asm/daifflags.h>
 #include <asm/proc-fns.h>
-#include <asm-generic/mm_hooks.h>
 #include <asm/cputype.h>
 #include <asm/sysreg.h>
 #include <asm/tlbflush.h>
@@ -175,9 +175,36 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
	atomic64_set(&mm->context.id, 0);
	refcount_set(&mm->context.pinned, 0);
+
+	/* pkey 0 is the default, so always reserve it. */
+	mm->context.pkey_allocation_map = BIT(0);
+
	return 0;
 }

+static inline void arch_dup_pkeys(struct mm_struct *oldmm,
+				  struct mm_struct *mm)
+{
+	/* Duplicate the oldmm pkey state in mm: */
+	mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
+}
+
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+	arch_dup_pkeys(oldmm, mm);
+
+	return 0;
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+}
+
+static inline void arch_unmap(struct mm_struct *mm,
+			unsigned long start, unsigned long end)
+{
+}
+
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 static inline void update_saved_ttbr0(struct task_struct *tsk,
				      struct mm_struct *mm)
@@ -267,6 +294,23 @@ static inline unsigned long mm_untag_mask(struct mm_struct *mm)
	return -1UL >> 8;
 }

+/*
+ * Only enforce protection keys on the current process, because there is no
+ * user context to access POR_EL0 for another address space.
+ */
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+		bool write, bool execute, bool foreign)
+{
+	if (!system_supports_poe())
+		return true;
+
+	/* allow access if the VMA is not one from this process */
+	if (foreign || vma_is_foreign(vma))
+		return true;
+
+	return por_el0_allows_pkey(vma_pkey(vma), write, execute);
+}
+
 #include <asm-generic/mmu_context.h>

 #endif /* !__ASSEMBLY__ */

diff --git arch/arm64/include/asm/pgtable.h arch/arm64/include/asm/pgtable.h
index 61a674942a6b..96c2b0b07c4c 100644
--- arch/arm64/include/asm/pgtable.h
+++ arch/arm64/include/asm/pgtable.h
@@ -34,6 +34,7 @@
 #include <asm/cmpxchg.h>
 #include <asm/fixmap.h>
+#include <asm/por.h>
 #include <linux/mmdebug.h>
 #include <linux/mm_types.h>
 #include <linux/sched.h>
@@ -149,6 +150,24 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
 #define pte_accessible(mm, pte)	\
	(mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))

+static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute)
+{
+	u64 por;
+
+	if (!system_supports_poe())
+		return true;
+
+	por = read_sysreg_s(SYS_POR_EL0);
+
+	if (write)
+		return por_elx_allows_write(por, pkey);
+
+	if (execute)
+		return por_elx_allows_exec(por, pkey);
+
+	return por_elx_allows_read(por, pkey);
+}
+
 /*
  * p??_access_permitted() is true for valid user mappings (PTE_USER
  * bit set, subject to the write permission check). For execute-only
@@ -159,7 +178,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
 #define pte_access_permitted_no_overlay(pte, write) \
	(((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte)))
 #define pte_access_permitted(pte, write) \
-	pte_access_permitted_no_overlay(pte, write)
+	(pte_access_permitted_no_overlay(pte, write) && \
+	por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false))
 #define pmd_access_permitted(pmd, write) \
	(pte_access_permitted(pmd_pte(pmd), (write)))
 #define pud_access_permitted(pud, write) \

diff --git arch/arm64/include/asm/pkeys.h arch/arm64/include/asm/pkeys.h
new file mode 100644
index 000000000000..32c352bb36b9
--- /dev/null
+++ arch/arm64/include/asm/pkeys.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Arm Ltd.
+ *
+ * Based on arch/x86/include/asm/pkeys.h
+ */
+
+#ifndef _ASM_ARM64_PKEYS_H
+#define _ASM_ARM64_PKEYS_H
+
+#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2)
+
+#define arch_max_pkey() 8
+
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+		unsigned long init_val);
+
+static inline bool arch_pkeys_enabled(void)
+{
+	return false;
+}
+
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+	return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT;
+}
+
+static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
+		int prot, int pkey)
+{
+	if (pkey != -1)
+		return pkey;
+
+	return vma_pkey(vma);
+}
+
+static inline int execute_only_pkey(struct mm_struct *mm)
+{
+	// Execute-only mappings are handled by EPAN/FEAT_PAN3.
+	WARN_ON_ONCE(!cpus_have_final_cap(ARM64_HAS_EPAN));
+
+	return -1;
+}
+
+#define mm_pkey_allocation_map(mm)	(mm)->context.pkey_allocation_map
+#define mm_set_pkey_allocated(mm, pkey) do {		\
+	mm_pkey_allocation_map(mm) |= (1U << pkey);	\
+} while (0)
+#define mm_set_pkey_free(mm, pkey) do {			\
+	mm_pkey_allocation_map(mm) &= ~(1U << pkey);	\
+} while (0)
+
+static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
+{
+	/*
+	 * "Allocated" pkeys are those that have been returned
+	 * from pkey_alloc() or pkey 0 which is allocated
+	 * implicitly when the mm is created.
+	 */
+	if (pkey < 0 || pkey >= arch_max_pkey())
+		return false;
+
+	return mm_pkey_allocation_map(mm) & (1U << pkey);
+}
+
+/*
+ * Returns a positive, 3-bit key on success, or -1 on failure.
+ */
+static inline int mm_pkey_alloc(struct mm_struct *mm)
+{
+	/*
+	 * Note: this is the one and only place we make sure
+	 * that the pkey is valid as far as the hardware is
+	 * concerned. The rest of the kernel trusts that
+	 * only good, valid pkeys come out of here.
+	 */
+	u8 all_pkeys_mask = GENMASK(arch_max_pkey() - 1, 0);
+	int ret;
+
+	if (!arch_pkeys_enabled())
+		return -1;
+
+	/*
+	 * Are we out of pkeys? We must handle this specially
+	 * because ffz() behavior is undefined if there are no
+	 * zeros.
+	 */
+	if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
+		return -1;
+
+	ret = ffz(mm_pkey_allocation_map(mm));
+
+	mm_set_pkey_allocated(mm, ret);
+
+	return ret;
+}
+
+static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
+{
+	if (!mm_pkey_is_allocated(mm, pkey))
+		return -EINVAL;
+
+	mm_set_pkey_free(mm, pkey);
+
+	return 0;
+}
+
+#endif /* _ASM_ARM64_PKEYS_H */

diff --git arch/arm64/include/asm/por.h arch/arm64/include/asm/por.h
new file mode 100644
index 000000000000..e06e9f473675
--- /dev/null
+++ arch/arm64/include/asm/por.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Arm Ltd.
+ */
+
+#ifndef _ASM_ARM64_POR_H
+#define _ASM_ARM64_POR_H
+
+#define POR_BITS_PER_PKEY		4
+#define POR_ELx_IDX(por_elx, idx)	(((por_elx) >> ((idx) * POR_BITS_PER_PKEY)) & 0xf)
+
+static inline bool por_elx_allows_read(u64 por, u8 pkey)
+{
+	u8 perm = POR_ELx_IDX(por, pkey);
+
+	return perm & POE_R;
+}
+
+static inline bool por_elx_allows_write(u64 por, u8 pkey)
+{
+	u8 perm = POR_ELx_IDX(por, pkey);
+
+	return perm & POE_W;
+}
+
+static inline bool por_elx_allows_exec(u64 por, u8 pkey)
+{
+	u8 perm = POR_ELx_IDX(por, pkey);
+
+	return perm & POE_X;
+}
+
+#endif /* _ASM_ARM64_POR_H */

diff --git arch/arm64/include/uapi/asm/mman.h arch/arm64/include/uapi/asm/mman.h
index 1e6482a838e1..e7e0c8216243 100644
--- arch/arm64/include/uapi/asm/mman.h
+++ arch/arm64/include/uapi/asm/mman.h
@@ -7,4 +7,13 @@
 #define PROT_BTI	0x10		/* BTI guarded page */
 #define PROT_MTE	0x20		/* Normal Tagged mapping */

+/* Override any generic PKEY permission defines */
+#define PKEY_DISABLE_EXECUTE	0x4
+#define PKEY_DISABLE_READ	0x8
+#undef PKEY_ACCESS_MASK
+#define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
+				 PKEY_DISABLE_WRITE  |\
+				 PKEY_DISABLE_READ   |\
+				 PKEY_DISABLE_EXECUTE)
+
 #endif /* ! _UAPI__ASM_MMAN_H */

diff --git arch/arm64/mm/mmu.c arch/arm64/mm/mmu.c
index 353ea5dc32b8..e55b02fbddc8 100644
--- arch/arm64/mm/mmu.c
+++ arch/arm64/mm/mmu.c
@@ -25,6 +25,7 @@
 #include <linux/vmalloc.h>
 #include <linux/set_memory.h>
 #include <linux/kfence.h>
+#include <linux/pkeys.h>

 #include <asm/barrier.h>
 #include <asm/cputype.h>
@@ -1549,3 +1550,47 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)

	cpu_uninstall_idmap();
 }
+
+#ifdef CONFIG_ARCH_HAS_PKEYS
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
+{
+	u64 new_por = POE_RXW;
+	u64 old_por;
+	u64 pkey_shift;
+
+	if (!system_supports_poe())
+		return -ENOSPC;
+
+	/*
+	 * This code should only be called with valid 'pkey'
+	 * values originating from in-kernel users. Complain
+	 * if a bad value is observed.
+	 */
+	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
+		return -EINVAL;
+
+	/* Set the bits we need in POR: */
+	new_por = POE_RXW;
+	if (init_val & PKEY_DISABLE_WRITE)
+		new_por &= ~POE_W;
+	if (init_val & PKEY_DISABLE_ACCESS)
+		new_por &= ~POE_RW;
+	if (init_val & PKEY_DISABLE_READ)
+		new_por &= ~POE_R;
+	if (init_val & PKEY_DISABLE_EXECUTE)
+		new_por &= ~POE_X;
+
+	/* Shift the bits in to the correct place in POR for pkey: */
+	pkey_shift = pkey * POR_BITS_PER_PKEY;
+	new_por <<= pkey_shift;
+
+	/* Get old POR and mask off any old bits in place: */
+	old_por = read_sysreg_s(SYS_POR_EL0);
+	old_por &= ~(POE_MASK << pkey_shift);
+
+	/* Write old part along with new part: */
+	write_sysreg_s(old_por | new_por, SYS_POR_EL0);
+
+	return 0;
+}
+#endif
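For reference, the userspace flow this enables would look roughly like the following (illustrative only; glibc wrappers assumed, error handling omitted):

	int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);	/* ends up in arch_set_user_pkey_access() */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	pkey_mprotect(p, 4096, PROT_READ | PROT_WRITE, pkey);
	/* reads are still allowed, writes now raise SIGSEGV/SEGV_PKUERR */

	pkey_free(pkey);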
Add PKEY support to signals, by saving and restoring POR_EL0 from the stackframe.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Mark Brown <broonie@kernel.org>
Acked-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/include/uapi/asm/sigcontext.h |  7 +++
 arch/arm64/kernel/signal.c               | 62 ++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git arch/arm64/include/uapi/asm/sigcontext.h arch/arm64/include/uapi/asm/sigcontext.h
index 8a45b7a411e0..e4cba8a6c9a2 100644
--- arch/arm64/include/uapi/asm/sigcontext.h
+++ arch/arm64/include/uapi/asm/sigcontext.h
@@ -98,6 +98,13 @@ struct esr_context {
	__u64 esr;
 };

+#define POE_MAGIC	0x504f4530
+
+struct poe_context {
+	struct _aarch64_ctx head;
+	__u64 por_el0;
+};
+
 /*
  * extra_context: describes extra space in the signal frame for
  * additional structures that don't fit in sigcontext.__reserved[].

diff --git arch/arm64/kernel/signal.c arch/arm64/kernel/signal.c
index 4a77f4976e11..561986947530 100644
--- arch/arm64/kernel/signal.c
+++ arch/arm64/kernel/signal.c
@@ -61,6 +61,7 @@ struct rt_sigframe_user_layout {
	unsigned long za_offset;
	unsigned long zt_offset;
	unsigned long fpmr_offset;
+	unsigned long poe_offset;
	unsigned long extra_offset;
	unsigned long end_offset;
 };
@@ -185,6 +186,8 @@ struct user_ctxs {
	u32 zt_size;
	struct fpmr_context __user *fpmr;
	u32 fpmr_size;
+	struct poe_context __user *poe;
+	u32 poe_size;
 };

 static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
@@ -258,6 +261,32 @@ static int restore_fpmr_context(struct user_ctxs *user)
	return err;
 }

+static int preserve_poe_context(struct poe_context __user *ctx)
+{
+	int err = 0;
+
+	__put_user_error(POE_MAGIC, &ctx->head.magic, err);
+	__put_user_error(sizeof(*ctx), &ctx->head.size, err);
+	__put_user_error(read_sysreg_s(SYS_POR_EL0), &ctx->por_el0, err);
+
+	return err;
+}
+
+static int restore_poe_context(struct user_ctxs *user)
+{
+	u64 por_el0;
+	int err = 0;
+
+	if (user->poe_size != sizeof(*user->poe))
+		return -EINVAL;
+
+	__get_user_error(por_el0, &(user->poe->por_el0), err);
+	if (!err)
+		write_sysreg_s(por_el0, SYS_POR_EL0);
+
+	return err;
+}
+
 #ifdef CONFIG_ARM64_SVE

 static int preserve_sve_context(struct sve_context __user *ctx)
@@ -621,6 +650,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
	user->za = NULL;
	user->zt = NULL;
	user->fpmr = NULL;
+	user->poe = NULL;

	if (!IS_ALIGNED((unsigned long)base, 16))
		goto invalid;
@@ -671,6 +701,17 @@ static int parse_user_sigframe(struct user_ctxs *user,
			/* ignore */
			break;

+		case POE_MAGIC:
+			if (!system_supports_poe())
+				goto invalid;
+
+			if (user->poe)
+				goto invalid;
+
+			user->poe = (struct poe_context __user *)head;
+			user->poe_size = size;
+			break;
+
		case SVE_MAGIC:
			if (!system_supports_sve() && !system_supports_sme())
				goto invalid;
@@ -857,6 +898,9 @@ static int restore_sigframe(struct pt_regs *regs,
	if (err == 0 && system_supports_sme2() && user.zt)
		err = restore_zt_context(&user);

+	if (err == 0 && system_supports_poe() && user.poe)
+		err = restore_poe_context(&user);
+
	return err;
 }

@@ -980,6 +1024,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
			return err;
	}

+	if (system_supports_poe()) {
+		err = sigframe_alloc(user, &user->poe_offset,
+				     sizeof(struct poe_context));
+		if (err)
+			return err;
+	}
+
	return sigframe_alloc_end(user);
 }

@@ -1042,6 +1093,14 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
		err |= preserve_fpmr_context(fpmr_ctx);
	}

+	if (system_supports_poe() && err == 0 && user->poe_offset) {
+		struct poe_context __user *poe_ctx =
+			apply_user_offset(user, user->poe_offset);
+
+		err |= preserve_poe_context(poe_ctx);
+	}
+
	/* ZA state if present */
	if (system_supports_sme() && err == 0 && user->za_offset) {
		struct za_context __user *za_ctx =
@@ -1178,6 +1237,9 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
		sme_smstop();
	}

+	if (system_supports_poe())
+		write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
+
	if (ka->sa.sa_flags & SA_RESTORER)
		sigtramp = ka->sa.sa_restorer;
	else
On 22/08/2024 17:11, Joey Gouly wrote:
> @@ -1178,6 +1237,9 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
>  		sme_smstop();
>  	}
> +	if (system_supports_poe())
> +		write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
At the point where setup_return() is called, the signal frame has already been written to the user stack. In other words, we write to the user stack first, and then reset POR_EL0. This may be problematic, especially if we are using the alternate signal stack, which the interrupted POR_EL0 may not grant access to. In that situation uaccess will fail and we'll end up with a SIGSEGV.
This issue has already been discussed on the x86 side, and as it happens patches to reset PKRU early [1] have just landed. I don't think this is a blocker for getting this series landed, but we should try and align with x86. If there's no objection, I'm planning to work on a counterpart to the x86 series (resetting POR_EL0 early during signal delivery).
Kevin
[1] https://lore.kernel.org/lkml/20240802061318.2140081-2-aruna.ramakrishna@orac...
>  	if (ka->sa.sa_flags & SA_RESTORER)
>  		sigtramp = ka->sa.sa_restorer;
>  	else
On Tue, Sep 24, 2024 at 01:27:58PM +0200, Kevin Brodsky wrote:
> On 22/08/2024 17:11, Joey Gouly wrote:
> > @@ -1178,6 +1237,9 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
> >  		sme_smstop();
> >  	}
> > +	if (system_supports_poe())
> > +		write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
>
> At the point where setup_return() is called, the signal frame has already been written to the user stack. In other words, we write to the user stack first, and then reset POR_EL0. This may be problematic, especially if we are using the alternate signal stack, which the interrupted POR_EL0 may not grant access to. In that situation uaccess will fail and we'll end up with a SIGSEGV.
>
> This issue has already been discussed on the x86 side, and as it happens patches to reset PKRU early [1] have just landed. I don't think this is a blocker for getting this series landed, but we should try and align with x86. If there's no objection, I'm planning to work on a counterpart to the x86 series (resetting POR_EL0 early during signal delivery).
>
> Kevin
>
> [1] https://lore.kernel.org/lkml/20240802061318.2140081-2-aruna.ramakrishna@orac...
+1, all the uaccess in signal delivery is done by the kernel on behalf of the signal handler context, so we should do it with (at least) the same memory permissions that the signal handler is going to be entered with.
(In an ideal world, userspace would save this information itself, using its own handler permissions -- well, no, in an ideal world we wouldn't have the signal delivery mechanism at all, but hopefully you get the idea.)
Cheers ---Dave
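To make the ordering Kevin describes concrete, here is a pseudo-code sketch of the current arm64 delivery path (function names as in arch/arm64/kernel/signal.c; the early reset is only a proposal at this point, not existing code):

	/*
	 * setup_rt_frame():
	 *	setup_sigframe(...);	// uaccess to the (possibly alternate)
	 *				// signal stack, still under the
	 *				// interrupted task's POR_EL0
	 *	setup_return(...);	// only now: POR_EL0 <- POR_EL0_INIT
	 *
	 * If the interrupted POR_EL0 does not grant access to the sigaltstack,
	 * the uaccess in setup_sigframe() faults and the task gets SIGSEGV.
	 * The proposed counterpart to the x86 PKRU change would reset POR_EL0
	 * before setup_sigframe() runs.
	 */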
Add a regset for POE containing POR_EL0.
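For illustration, a tracer would read the regset added below roughly as follows (a minimal sketch with error handling omitted; the NT_ARM_POE fallback define mirrors the elf.h hunk in this patch):

	#include <elf.h>
	#include <stdint.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/uio.h>

	#ifndef NT_ARM_POE
	#define NT_ARM_POE 0x40f		/* matches the elf.h change below */
	#endif

	static uint64_t read_por_el0(pid_t child)
	{
		uint64_t por = 0;
		struct iovec iov = {
			.iov_base = &por,
			.iov_len  = sizeof(por),	/* one u64, per REGSET_POE */
		};

		/* the tracee must be ptrace-stopped for this to succeed */
		ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov);
		return por;
	}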
Signed-off-by: Joey Gouly joey.gouly@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Reviewed-by: Mark Brown broonie@kernel.org Reviewed-by: Catalin Marinas catalin.marinas@arm.com Reviewed-by: Anshuman Khandual anshuman.khandual@arm.com --- arch/arm64/kernel/ptrace.c | 46 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 1 + 2 files changed, 47 insertions(+)
diff --git arch/arm64/kernel/ptrace.c arch/arm64/kernel/ptrace.c index 0d022599eb61..b756578aeaee 100644 --- arch/arm64/kernel/ptrace.c +++ arch/arm64/kernel/ptrace.c @@ -1440,6 +1440,39 @@ static int tagged_addr_ctrl_set(struct task_struct *target, const struct } #endif
+#ifdef CONFIG_ARM64_POE +static int poe_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_poe()) + return -EINVAL; + + return membuf_write(&to, &target->thread.por_el0, + sizeof(target->thread.por_el0)); +} + +static int poe_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + if (!system_supports_poe()) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + target->thread.por_el0 = ctrl; + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1469,6 +1502,9 @@ enum aarch64_regset { #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI REGSET_TAGGED_ADDR_CTRL, #endif +#ifdef CONFIG_ARM64_POE + REGSET_POE +#endif };
static const struct user_regset aarch64_regsets[] = { @@ -1628,6 +1664,16 @@ static const struct user_regset aarch64_regsets[] = { .set = tagged_addr_ctrl_set, }, #endif +#ifdef CONFIG_ARM64_POE + [REGSET_POE] = { + .core_note_type = NT_ARM_POE, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = poe_get, + .set = poe_set, + }, +#endif };
static const struct user_regset_view user_aarch64_view = { diff --git include/uapi/linux/elf.h include/uapi/linux/elf.h index b54b313bcf07..81762ff3c99e 100644 --- include/uapi/linux/elf.h +++ include/uapi/linux/elf.h @@ -441,6 +441,7 @@ typedef struct elf64_shdr { #define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ #define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ #define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ +#define NT_ARM_POE 0x40f /* ARM POE registers */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */
Permission Indirection Extension and Permission Overlay Extension can be enabled independently.
When PIE is disabled and POE is enabled, the permissions set by POR_EL0 will be applied on top of the permissions set in the PTE.
When both PIE and POE are enabled, the permissions set by POR_EL0 will be applied on top of the permissions set by the PIRE0_EL1 register. However, PIRE0_EL1 has encodings that specifically control whether the overlay is applied.
For example:

  0001	Read, Overlay applied.
  1000	Read, Overlay not applied.
Switch to using the 'Overlay applied' encodings in PIRE0_EL1, so that PIE and POE can coexist.
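Illustratively, the two encoding classes combine with POR_EL0 like this (the combination is performed by the MMU, not by kernel code; the function and parameter names below are made up for the example):

	#include <stdbool.h>

	/* Sketch only: no such kernel function exists. */
	static unsigned int effective_perms(unsigned int pie_perms,
					    bool overlay_applied,
					    unsigned int por_el0_perms)
	{
		if (!overlay_applied)			/* e.g. encoding 1000 above */
			return pie_perms;
		return pie_perms & por_el0_perms;	/* e.g. encoding 0001 above */
	}

Switching PIE_E0 to the *_O encodings below therefore makes the overlay take effect for all EL0 mappings.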
Signed-off-by: Joey Gouly joey.gouly@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Reviewed-by: Catalin Marinas catalin.marinas@arm.com --- arch/arm64/include/asm/pgtable-prot.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git arch/arm64/include/asm/pgtable-prot.h arch/arm64/include/asm/pgtable-prot.h index b11cfb9fdd37..2a11d0c10760 100644 --- arch/arm64/include/asm/pgtable-prot.h +++ arch/arm64/include/asm/pgtable-prot.h @@ -154,10 +154,10 @@ static inline bool __pure lpa2_is_enabled(void)
#define PIE_E0 ( \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW)) + PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW_O))
#define PIE_E1 ( \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \
Now that PKEYs support has been implemented, enable it for CPUs that support S1POE.
Signed-off-by: Joey Gouly joey.gouly@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Acked-by: Catalin Marinas catalin.marinas@arm.com Reviewed-by: Anshuman Khandual anshuman.khandual@arm.com --- arch/arm64/include/asm/pkeys.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git arch/arm64/include/asm/pkeys.h arch/arm64/include/asm/pkeys.h index 32c352bb36b9..19eb1b12b7fc 100644 --- arch/arm64/include/asm/pkeys.h +++ arch/arm64/include/asm/pkeys.h @@ -17,7 +17,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
static inline bool arch_pkeys_enabled(void) { - return false; + return system_supports_poe(); }
static inline int vma_pkey(struct vm_area_struct *vma)
Now that support for POE and Protection Keys has been implemented, add a Kconfig option to allow users to actually enable the feature.
Signed-off-by: Joey Gouly joey.gouly@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Reviewed-by: Anshuman Khandual anshuman.khandual@arm.com Acked-by: Catalin Marinas catalin.marinas@arm.com --- arch/arm64/Kconfig | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+)
diff --git arch/arm64/Kconfig arch/arm64/Kconfig index a2f8ff354ca6..35dfc6275328 100644 --- arch/arm64/Kconfig +++ arch/arm64/Kconfig @@ -2137,6 +2137,29 @@ config ARM64_EPAN if the cpu does not implement the feature. endmenu # "ARMv8.7 architectural features"
+menu "ARMv8.9 architectural features" + +config ARM64_POE + prompt "Permission Overlay Extension" + def_bool y + select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_PKEYS + help + The Permission Overlay Extension is used to implement Memory + Protection Keys. Memory Protection Keys provides a mechanism for + enforcing page-based protections, but without requiring modification + of the page tables when an application changes protection domains. + + For details, see Documentation/core-api/protection-keys.rst + + If unsure, say y. + +config ARCH_PKEY_BITS + int + default 3 + +endmenu # "ARMv8.9 architectural features" + config ARM64_SVE bool "ARM Scalable Vector Extension support" default y
Put the get_header() function in the header so that it can be used by other tests, without needing to link against testcases.c.
This will be used by tools/testing/selftests/mm/protection_keys.c.
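As a usage sketch, the eventual mm-test caller looks roughly like this (mirroring aarch64_write_signal_pkey() and the GET_UC_RESV_HEAD macro, both added later in this series):

	#include <stdint.h>
	#include <ucontext.h>
	#include "../arm64/signal/testcases/testcases.h"

	static void write_signal_pkey(ucontext_t *uctxt, uint64_t pkey_reg)
	{
		struct _aarch64_ctx *head = GET_UC_RESV_HEAD(uctxt);
		struct poe_context *poe = (struct poe_context *)
			get_header(head, POE_MAGIC,
				   sizeof(uctxt->uc_mcontext), NULL);

		/* patch the saved POR_EL0 so it is restored on sigreturn */
		if (poe)
			poe->por_el0 = pkey_reg;
	}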
Signed-off-by: Joey Gouly joey.gouly@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: Andrew Morton akpm@linux-foundation.org Cc: Shuah Khan shuah@kernel.org Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Reviewed-by: Mark Brown broonie@kernel.org --- .../arm64/signal/testcases/testcases.c | 23 ----------------- .../arm64/signal/testcases/testcases.h | 25 +++++++++++++++++-- 2 files changed, 23 insertions(+), 25 deletions(-)
diff --git tools/testing/selftests/arm64/signal/testcases/testcases.c tools/testing/selftests/arm64/signal/testcases/testcases.c index 674b88cc8c39..e4331440fed0 100644 --- tools/testing/selftests/arm64/signal/testcases/testcases.c +++ tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -6,29 +6,6 @@
#include "testcases.h"
-struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, - size_t resv_sz, size_t *offset) -{ - size_t offs = 0; - struct _aarch64_ctx *found = NULL; - - if (!head || resv_sz < HDR_SZ) - return found; - - while (offs <= resv_sz - HDR_SZ && - head->magic != magic && head->magic) { - offs += head->size; - head = GET_RESV_NEXT_HEAD(head); - } - if (head->magic == magic) { - found = head; - if (offset) - *offset = offs; - } - - return found; -} - bool validate_extra_context(struct extra_context *extra, char **err, void **extra_data, size_t *extra_size) { diff --git tools/testing/selftests/arm64/signal/testcases/testcases.h tools/testing/selftests/arm64/signal/testcases/testcases.h index 7727126347e0..3185e6875694 100644 --- tools/testing/selftests/arm64/signal/testcases/testcases.h +++ tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -88,8 +88,29 @@ struct fake_sigframe {
bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err);
-struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, - size_t resv_sz, size_t *offset); +static inline struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic, + size_t resv_sz, size_t *offset) +{ + size_t offs = 0; + struct _aarch64_ctx *found = NULL; + + if (!head || resv_sz < HDR_SZ) + return found; + + while (offs <= resv_sz - HDR_SZ && + head->magic != magic && head->magic) { + offs += head->size; + head = GET_RESV_NEXT_HEAD(head); + } + if (head->magic == magic) { + found = head; + if (offset) + *offset = offs; + } + + return found; +} +
static inline struct _aarch64_ctx *get_terminator(struct _aarch64_ctx *head, size_t resv_sz,
arm64's fpregs are not at a constant offset from sigcontext. Since this is not an important part of the test, don't print the fpregs pointer on arm64.
Signed-off-by: Joey Gouly joey.gouly@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: Andrew Morton akpm@linux-foundation.org Cc: Shuah Khan shuah@kernel.org Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Acked-by: Dave Hansen dave.hansen@linux.intel.com --- tools/testing/selftests/mm/pkey-powerpc.h | 1 + tools/testing/selftests/mm/pkey-x86.h | 2 ++ tools/testing/selftests/mm/protection_keys.c | 6 ++++++ 3 files changed, 9 insertions(+)
diff --git tools/testing/selftests/mm/pkey-powerpc.h tools/testing/selftests/mm/pkey-powerpc.h index ae5df26104e5..6275d0f474b3 100644 --- tools/testing/selftests/mm/pkey-powerpc.h +++ tools/testing/selftests/mm/pkey-powerpc.h @@ -9,6 +9,7 @@ #endif #define REG_IP_IDX PT_NIP #define REG_TRAPNO PT_TRAP +#define MCONTEXT_FPREGS #define gregs gp_regs #define fpregs fp_regs #define si_pkey_offset 0x20 diff --git tools/testing/selftests/mm/pkey-x86.h tools/testing/selftests/mm/pkey-x86.h index 814758e109c0..b9170a26bfcb 100644 --- tools/testing/selftests/mm/pkey-x86.h +++ tools/testing/selftests/mm/pkey-x86.h @@ -15,6 +15,8 @@
#endif
+#define MCONTEXT_FPREGS + #ifndef PKEY_DISABLE_ACCESS # define PKEY_DISABLE_ACCESS 0x1 #endif diff --git tools/testing/selftests/mm/protection_keys.c tools/testing/selftests/mm/protection_keys.c index eaa6d1fc5328..4337106a985e 100644 --- tools/testing/selftests/mm/protection_keys.c +++ tools/testing/selftests/mm/protection_keys.c @@ -314,7 +314,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) ucontext_t *uctxt = vucontext; int trapno; unsigned long ip; +#ifdef MCONTEXT_FPREGS char *fpregs; +#endif #if defined(__i386__) || defined(__x86_64__) /* arch */ u32 *pkey_reg_ptr; int pkey_reg_offset; @@ -330,7 +332,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; +#ifdef MCONTEXT_FPREGS fpregs = (char *) uctxt->uc_mcontext.fpregs; +#endif
dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", __func__, trapno, ip, si_code_str(si->si_code), @@ -359,7 +363,9 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) #endif /* arch */
dprintf1("siginfo: %p\n", si); +#ifdef MCONTEXT_FPREGS dprintf1(" fpregs: %p\n", fpregs); +#endif
if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) ||
The encoding of the pkey register differs on arm64 from x86/ppc. On those platforms, a set bit in the register disables a permission; on arm64, a set bit indicates that the permission is allowed.
This drops two asserts of the form:

  assert(read_pkey_reg() <= orig_pkey_reg);

because on arm64 this relation doesn't hold, due to the encoding.
In the signal handler, the pkey must be reset to allow both access and write. pkey_access_allow() currently works for PowerPC, as PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE have overlapping bits set.
Access to the uc_mcontext is abstracted, as arm64 has a different structure.
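To illustrate the inverted encoding in one place (the POE_* value and the 4-bits-per-key layout come from pkey-arm64.h below; the x86 comparison reflects its 2-bits-per-key PKRU layout):

	#include <stdint.h>

	#define POE_RX			0x3	/* read+execute, from pkey-arm64.h */
	#define PKEY_BITS_PER_PKEY	4

	/* arm64: write-protect a key by *granting* only read+execute */
	static uint64_t arm64_deny_write(uint64_t por, int pkey)
	{
		por &= ~(0xfULL << (pkey * PKEY_BITS_PER_PKEY));
		return por | ((uint64_t)POE_RX << (pkey * PKEY_BITS_PER_PKEY));
	}

On x86 the same operation instead sets PKEY_DISABLE_WRITE in the key's 2-bit field, so restricting a key makes the register value larger; on arm64 it makes it smaller, which is why the monotonicity asserts dropped above cannot be kept.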
Signed-off-by: Joey Gouly joey.gouly@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: Andrew Morton akpm@linux-foundation.org Cc: Shuah Khan shuah@kernel.org Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Acked-by: Dave Hansen dave.hansen@linux.intel.com --- .../arm64/signal/testcases/testcases.h | 3 + tools/testing/selftests/mm/Makefile | 2 +- tools/testing/selftests/mm/pkey-arm64.h | 139 ++++++++++++++++++ tools/testing/selftests/mm/pkey-helpers.h | 8 + tools/testing/selftests/mm/pkey-powerpc.h | 2 + tools/testing/selftests/mm/pkey-x86.h | 2 + tools/testing/selftests/mm/protection_keys.c | 103 +++++++++++-- 7 files changed, 247 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/mm/pkey-arm64.h
diff --git tools/testing/selftests/arm64/signal/testcases/testcases.h tools/testing/selftests/arm64/signal/testcases/testcases.h index 3185e6875694..9872b8912714 100644 --- tools/testing/selftests/arm64/signal/testcases/testcases.h +++ tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -26,6 +26,9 @@ #define HDR_SZ \ sizeof(struct _aarch64_ctx)
+#define GET_UC_RESV_HEAD(uc) \ + (struct _aarch64_ctx *)(&(uc->uc_mcontext.__reserved)) + #define GET_SF_RESV_HEAD(sf) \ (struct _aarch64_ctx *)(&(sf).uc.uc_mcontext.__reserved)
diff --git tools/testing/selftests/mm/Makefile tools/testing/selftests/mm/Makefile index cfad627e8d94..5f2ca591c956 100644 --- tools/testing/selftests/mm/Makefile +++ tools/testing/selftests/mm/Makefile @@ -106,7 +106,7 @@ TEST_GEN_FILES += $(BINARIES_64) endif else
-ifneq (,$(findstring $(ARCH),powerpc)) +ifneq (,$(filter $(ARCH),arm64 powerpc)) TEST_GEN_FILES += protection_keys endif
diff --git tools/testing/selftests/mm/pkey-arm64.h tools/testing/selftests/mm/pkey-arm64.h new file mode 100644 index 000000000000..580e1b0bb38e --- /dev/null +++ tools/testing/selftests/mm/pkey-arm64.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Arm Ltd. + */ + +#ifndef _PKEYS_ARM64_H +#define _PKEYS_ARM64_H + +#include "vm_util.h" +/* for signal frame parsing */ +#include "../arm64/signal/testcases/testcases.h" + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 288 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 289 +# define SYS_pkey_free 290 +#endif +#define MCONTEXT_IP(mc) mc.pc +#define MCONTEXT_TRAPNO(mc) -1 + +#define PKEY_MASK 0xf + +#define POE_NONE 0x0 +#define POE_X 0x2 +#define POE_RX 0x3 +#define POE_RWX 0x7 + +#define NR_PKEYS 8 +#define NR_RESERVED_PKEYS 1 /* pkey-0 */ + +#define PKEY_ALLOW_ALL 0x77777777 + +#define PKEY_BITS_PER_PKEY 4 +#define PAGE_SIZE sysconf(_SC_PAGESIZE) +#undef HPAGE_SIZE +#define HPAGE_SIZE default_huge_page_size() + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg = 0; + + // POR_EL0 + asm volatile("mrs %0, S3_3_c10_c2_4" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 por = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + // POR_EL0 + asm volatile("msr S3_3_c10_c2_4, %0\nisb" :: "r" (por) :); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#define set_pkey_bits set_pkey_bits +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + u64 new_val = POE_RWX; + + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + + if (flags & PKEY_DISABLE_ACCESS) + new_val = POE_X; + else if (flags & PKEY_DISABLE_WRITE) + new_val = POE_RX; + + /* OR in new bits for pkey */ + reg |= new_val << shift; + + return reg; +} + +#define get_pkey_bits get_pkey_bits +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest four, then + * mask off all the other higher bits + */ + u32 perm = (reg >> shift) & PKEY_MASK; + + if (perm == POE_X) + return PKEY_DISABLE_ACCESS; + if (perm == POE_RX) + return PKEY_DISABLE_WRITE; + return 0; +} + +static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) +{ + struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt); + struct poe_context *poe_ctx = + (struct poe_context *) get_header(ctx, POE_MAGIC, + sizeof(uctxt->uc_mcontext), NULL); + if (poe_ctx) + poe_ctx->por_el0 = pkey; +} + +#endif /* _PKEYS_ARM64_H */ diff --git tools/testing/selftests/mm/pkey-helpers.h tools/testing/selftests/mm/pkey-helpers.h index 1af3156a9db8..15608350fc01 100644 --- tools/testing/selftests/mm/pkey-helpers.h +++ tools/testing/selftests/mm/pkey-helpers.h @@ -91,12 +91,17 @@ void record_pkey_malloc(void 
*ptr, long size, int prot); #include "pkey-x86.h" #elif defined(__powerpc64__) /* arch */ #include "pkey-powerpc.h" +#elif defined(__aarch64__) /* arch */ +#include "pkey-arm64.h" #else /* arch */ #error Architecture not supported #endif /* arch */
+#ifndef PKEY_MASK #define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) +#endif
+#ifndef set_pkey_bits static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) { u32 shift = pkey_bit_position(pkey); @@ -106,7 +111,9 @@ static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) reg |= (flags & PKEY_MASK) << shift; return reg; } +#endif
+#ifndef get_pkey_bits static inline u64 get_pkey_bits(u64 reg, int pkey) { u32 shift = pkey_bit_position(pkey); @@ -116,6 +123,7 @@ static inline u64 get_pkey_bits(u64 reg, int pkey) */ return ((reg >> shift) & PKEY_MASK); } +#endif
extern u64 shadow_pkey_reg;
diff --git tools/testing/selftests/mm/pkey-powerpc.h tools/testing/selftests/mm/pkey-powerpc.h index 6275d0f474b3..3d0c0bdae5bc 100644 --- tools/testing/selftests/mm/pkey-powerpc.h +++ tools/testing/selftests/mm/pkey-powerpc.h @@ -8,6 +8,8 @@ # define SYS_pkey_free 385 #endif #define REG_IP_IDX PT_NIP +#define MCONTEXT_IP(mc) mc.gp_regs[REG_IP_IDX] +#define MCONTEXT_TRAPNO(mc) mc.gp_regs[REG_TRAPNO] #define REG_TRAPNO PT_TRAP #define MCONTEXT_FPREGS #define gregs gp_regs diff --git tools/testing/selftests/mm/pkey-x86.h tools/testing/selftests/mm/pkey-x86.h index b9170a26bfcb..5f28e26a2511 100644 --- tools/testing/selftests/mm/pkey-x86.h +++ tools/testing/selftests/mm/pkey-x86.h @@ -15,6 +15,8 @@
#endif
+#define MCONTEXT_IP(mc) mc.gregs[REG_IP_IDX] +#define MCONTEXT_TRAPNO(mc) mc.gregs[REG_TRAPNO] #define MCONTEXT_FPREGS
#ifndef PKEY_DISABLE_ACCESS diff --git tools/testing/selftests/mm/protection_keys.c tools/testing/selftests/mm/protection_keys.c index 4337106a985e..0789981b72b9 100644 --- tools/testing/selftests/mm/protection_keys.c +++ tools/testing/selftests/mm/protection_keys.c @@ -147,7 +147,7 @@ void abort_hooks(void) * will then fault, which makes sure that the fault code handles * execute-only memory properly. */ -#ifdef __powerpc64__ +#if defined(__powerpc64__) || defined(__aarch64__) /* This way, both 4K and 64K alignment are maintained */ __attribute__((__aligned__(65536))) #else @@ -212,7 +212,6 @@ void pkey_disable_set(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights; - u64 orig_pkey_reg = read_pkey_reg();
dprintf1("START->%s(%d, 0x%x)\n", __func__, pkey, flags); @@ -242,8 +241,6 @@ void pkey_disable_set(int pkey, int flags)
dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - pkey_assert(read_pkey_reg() >= orig_pkey_reg); dprintf1("END<---%s(%d, 0x%x)\n", __func__, pkey, flags); } @@ -253,7 +250,6 @@ void pkey_disable_clear(int pkey, int flags) unsigned long syscall_flags = 0; int ret; int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u64 orig_pkey_reg = read_pkey_reg();
pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
@@ -273,8 +269,6 @@ void pkey_disable_clear(int pkey, int flags)
dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, pkey, read_pkey_reg()); - if (flags) - assert(read_pkey_reg() <= orig_pkey_reg); }
void pkey_write_allow(int pkey) @@ -330,8 +324,8 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
- trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; - ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + trapno = MCONTEXT_TRAPNO(uctxt->uc_mcontext); + ip = MCONTEXT_IP(uctxt->uc_mcontext); #ifdef MCONTEXT_FPREGS fpregs = (char *) uctxt->uc_mcontext.fpregs; #endif @@ -395,6 +389,8 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) #elif defined(__powerpc64__) /* arch */ /* restore access and let the faulting instruction continue */ pkey_access_allow(siginfo_pkey); +#elif defined(__aarch64__) + aarch64_write_signal_pkey(uctxt, PKEY_ALLOW_ALL); #endif /* arch */ pkey_faults++; dprintf1("<<<<==================================================\n"); @@ -908,7 +904,9 @@ void expected_pkey_fault(int pkey) * test program continue. We now have to restore it. */ if (__read_pkey_reg() != 0) -#else /* arch */ +#elif defined(__aarch64__) + if (__read_pkey_reg() != PKEY_ALLOW_ALL) +#else if (__read_pkey_reg() != shadow_pkey_reg) #endif /* arch */ pkey_assert(0); @@ -1498,6 +1496,11 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) lots_o_noops_around_write(&scratch); do_not_expect_pkey_fault("executing on PROT_EXEC memory"); expect_fault_on_read_execonly_key(p1, pkey); + + // Reset back to PROT_EXEC | PROT_READ for architectures that support + // non-PKEY execute-only permissions. + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC | PROT_READ, (u64)pkey); + pkey_assert(!ret); }
void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1671,6 +1674,84 @@ void test_ptrace_modifies_pkru(int *ptr, u16 pkey) } #endif
+#if defined(__aarch64__) +void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +{ + pid_t child; + int status, ret; + struct iovec iov; + u64 trace_pkey; + /* Just a random pkey value.. */ + u64 new_pkey = (POE_X << PKEY_BITS_PER_PKEY * 2) | + (POE_NONE << PKEY_BITS_PER_PKEY) | + POE_RWX; + + child = fork(); + pkey_assert(child >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), child); + if (!child) { + ptrace(PTRACE_TRACEME, 0, 0, 0); + + /* Stop and allow the tracer to modify PKRU directly */ + raise(SIGSTOP); + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + if (__read_pkey_reg() != new_pkey) + exit(1); + + raise(SIGSTOP); + + exit(0); + } + + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + iov.iov_base = &trace_pkey; + iov.iov_len = 8; + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == read_pkey_reg()); + + trace_pkey = new_pkey; + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == new_pkey); + + /* Execute the tracee */ + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value change */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(&trace_pkey, 0, sizeof(trace_pkey)); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov); + pkey_assert(ret == 0); + pkey_assert(trace_pkey == new_pkey); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFEXITED(status)); + pkey_assert(WEXITSTATUS(status) == 0); +} +#endif + void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) { int size = PAGE_SIZE; @@ -1706,7 +1787,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, test_pkey_alloc_free_attach_pkey0, -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) test_ptrace_modifies_pkru, #endif };
Check that when POE is enabled, the POR_EL0 register is accessible.
Signed-off-by: Joey Gouly joey.gouly@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: Mark Brown broonie@kernel.org Cc: Shuah Khan shuah@kernel.org Reviewed-by: Mark Brown broonie@kernel.org --- tools/testing/selftests/arm64/abi/hwcap.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git tools/testing/selftests/arm64/abi/hwcap.c tools/testing/selftests/arm64/abi/hwcap.c index d8909b2b535a..f2d6007a2b98 100644 --- tools/testing/selftests/arm64/abi/hwcap.c +++ tools/testing/selftests/arm64/abi/hwcap.c @@ -156,6 +156,12 @@ static void pmull_sigill(void) asm volatile(".inst 0x0ee0e000" : : : ); }
+static void poe_sigill(void) +{ + /* mrs x0, POR_EL0 */ + asm volatile("mrs x0, S3_3_C10_C2_4" : : : "x0"); +} + static void rng_sigill(void) { asm volatile("mrs x0, S3_3_C2_C4_0" : : : "x0"); @@ -601,6 +607,14 @@ static const struct hwcap_data { .cpuinfo = "pmull", .sigill_fn = pmull_sigill, }, + { + .name = "POE", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_POE, + .cpuinfo = "poe", + .sigill_fn = poe_sigill, + .sigill_reliable = true, + }, { .name = "RNG", .at_hwcap = AT_HWCAP2,
Teach the signal frame parsing about the new POE frame, which avoids a warning when it is generated.
Signed-off-by: Joey Gouly joey.gouly@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: Mark Brown broonie@kernel.org Cc: Shuah Khan shuah@kernel.org Reviewed-by: Mark Brown broonie@kernel.org --- tools/testing/selftests/arm64/signal/testcases/testcases.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git tools/testing/selftests/arm64/signal/testcases/testcases.c tools/testing/selftests/arm64/signal/testcases/testcases.c index e4331440fed0..e6daa94fcd2e 100644 --- tools/testing/selftests/arm64/signal/testcases/testcases.c +++ tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -161,6 +161,10 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err) if (head->size != sizeof(struct esr_context)) *err = "Bad size for esr_context"; break; + case POE_MAGIC: + if (head->size != sizeof(struct poe_context)) + *err = "Bad size for poe_context"; + break; case TPIDR2_MAGIC: if (head->size != sizeof(struct tpidr2_context)) *err = "Bad size for tpidr2_context";
Ensure that we get signal context for POR_EL0 if and only if POE is present on the system.
Copied from the TPIDR2 test.
Signed-off-by: Joey Gouly joey.gouly@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: Mark Brown broonie@kernel.org Cc: Shuah Khan shuah@kernel.org Reviewed-by: Mark Brown broonie@kernel.org Acked-by: Shuah Khan skhan@linuxfoundation.org --- .../testing/selftests/arm64/signal/.gitignore | 1 + .../arm64/signal/testcases/poe_siginfo.c | 86 +++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c
diff --git tools/testing/selftests/arm64/signal/.gitignore tools/testing/selftests/arm64/signal/.gitignore index 1ce5b5eac386..b2f2bfd5c6aa 100644 --- tools/testing/selftests/arm64/signal/.gitignore +++ tools/testing/selftests/arm64/signal/.gitignore @@ -2,6 +2,7 @@ mangle_* fake_sigreturn_* fpmr_* +poe_* sme_* ssve_* sve_* diff --git tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c new file mode 100644 index 000000000000..36bd9940ee05 --- /dev/null +++ tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Arm Limited + * + * Verify that the POR_EL0 register context in signal frames is set up as + * expected. + */ + +#include <signal.h> +#include <ucontext.h> +#include <sys/auxv.h> +#include <sys/prctl.h> +#include <unistd.h> +#include <asm/sigcontext.h> + +#include "test_signals_utils.h" +#include "testcases.h" + +static union { + ucontext_t uc; + char buf[1024 * 128]; +} context; + +#define SYS_POR_EL0 "S3_3_C10_C2_4" + +static uint64_t get_por_el0(void) +{ + uint64_t val; + + asm volatile( + "mrs %0, " SYS_POR_EL0 "\n" + : "=r"(val) + : + : ); + + return val; +} + +int poe_present(struct tdescr *td, siginfo_t *si, ucontext_t *uc) +{ + struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context); + struct poe_context *poe_ctx; + size_t offset; + bool in_sigframe; + bool have_poe; + __u64 orig_poe; + + have_poe = getauxval(AT_HWCAP2) & HWCAP2_POE; + if (have_poe) + orig_poe = get_por_el0(); + + if (!get_current_context(td, &context.uc, sizeof(context))) + return 1; + + poe_ctx = (struct poe_context *) + get_header(head, POE_MAGIC, td->live_sz, &offset); + + in_sigframe = poe_ctx != NULL; + + fprintf(stderr, "POR_EL0 sigframe %s on system %s POE\n", + in_sigframe ? "present" : "absent", + have_poe ? "with" : "without"); + + td->pass = (in_sigframe == have_poe); + + /* + * Check that the value we read back was the one present at + * the time that the signal was triggered. + */ + if (have_poe && poe_ctx) { + if (poe_ctx->por_el0 != orig_poe) { + fprintf(stderr, "POR_EL0 in frame is %llx, was %llx\n", + poe_ctx->por_el0, orig_poe); + td->pass = false; + } + } + + return 0; +} + +struct tdescr tde = { + .name = "POR_EL0", + .descr = "Validate that POR_EL0 is present as expected", + .timeout = 3, + .run = poe_present, +};
Add new system registers:
 - POR_EL1
 - POR_EL0
Signed-off-by: Joey Gouly joey.gouly@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: Marc Zyngier maz@kernel.org Cc: Oliver Upton oliver.upton@linux.dev Cc: Shuah Khan shuah@kernel.org Reviewed-by: Mark Brown broonie@kernel.org --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git tools/testing/selftests/kvm/aarch64/get-reg-list.c tools/testing/selftests/kvm/aarch64/get-reg-list.c index 4abebde78187..d43fb3f49050 100644 --- tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -40,6 +40,18 @@ static struct feature_id_reg feat_id_regs[] = { ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ 8, 1 + }, + { + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, + 1 + }, + { + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 16, + 1 } };
@@ -468,6 +480,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 4), /* POR_EL1 */ ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */ @@ -475,6 +488,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ ARM64_SYS_REG(3, 3, 14, 0, 1), /* CNTPCT_EL0 */