The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
f6d58266d731 ("KVM: SVM: retrieve VMCB from assembly")
f7ef280132f9 ("KVM: SVM: adjust register allocation for __svm_vcpu_run()")
16fdc1de169e ("KVM: SVM: replace regs argument of __svm_vcpu_run() with vcpu_svm")
debc5a1ec0d1 ("KVM: x86: use a separate asm-offsets.c file")
bb06650634d3 ("KVM: VMX: Convert launched argument to flags")
8bd200d23ec4 ("KVM: VMX: Flatten __vmx_vcpu_run()")
527a534c7326 ("x86/tdx: Provide common base for SEAMCALL and TDCALL C wrappers")
59bd54a84d15 ("x86/tdx: Detect running as a TDX guest in early boot")
6198311093da ("x86/cc: Move arch/x86/{kernel/cc_platform.c => coco/core.c}")
f94909ceb1ed ("x86: Prepare asm files for straight-line-speculation")
22da5a07c75e ("x86/lib/atomic64_386_32: Rename things")
1367afaa2ee9 ("x86/entry: Use the correct fence macro after swapgs in kernel CR3")
c07e45553da1 ("x86/entry: Add a fence for kernel entry SWAPGS in paranoid_entry()")
6e5772c8d9cf ("Merge tag 'x86_cc_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f6d58266d731fd7e63163790aad21e0dbb1d5264 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Mon, 7 Nov 2022 04:17:29 -0500
Subject: [PATCH] KVM: SVM: retrieve VMCB from assembly
Continue moving accesses to struct vcpu_svm to vmenter.S. Reducing the
number of arguments limits the chance of mistakes due to different
registers used for argument passing in 32- and 64-bit ABIs; pushing the
VMCB argument and almost immediately popping it into a different
register looks pretty weird.
32-bit ABI is not a concern for __svm_sev_es_vcpu_run() which is 64-bit
only; however, it will soon need @svm to save/restore SPEC_CTRL so stay
consistent with __svm_vcpu_run() and let them share the same prototype.
No functional change intended.
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
index 30db96852e2d..f1b694e431ae 100644
--- a/arch/x86/kvm/kvm-asm-offsets.c
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -15,6 +15,8 @@ static void __used common(void)
if (IS_ENABLED(CONFIG_KVM_AMD)) {
BLANK();
OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
+ OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
}
if (IS_ENABLED(CONFIG_KVM_INTEL)) {
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b412bc5773c5..0c86c435c51f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3914,12 +3914,11 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- unsigned long vmcb_pa = svm->current_vmcb->pa;
guest_state_enter_irqoff();
if (sev_es_guest(vcpu->kvm)) {
- __svm_sev_es_vcpu_run(vmcb_pa);
+ __svm_sev_es_vcpu_run(svm);
} else {
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
@@ -3930,7 +3929,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
* vmcb02 when switching vmcbs for nested virtualization.
*/
vmload(svm->vmcb01.pa);
- __svm_vcpu_run(vmcb_pa, svm);
+ __svm_vcpu_run(svm);
vmsave(svm->vmcb01.pa);
vmload(__sme_page_pa(sd->save_area));
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 447e25c9101a..7ff1879e73c5 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -683,7 +683,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
-void __svm_sev_es_vcpu_run(unsigned long vmcb_pa);
-void __svm_vcpu_run(unsigned long vmcb_pa, struct vcpu_svm *svm);
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm);
+void __svm_vcpu_run(struct vcpu_svm *svm);
#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 531510ab6072..d07bac1952c5 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -32,7 +32,6 @@
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
- * @vmcb_pa: unsigned long
* @svm: struct vcpu_svm *
*/
SYM_FUNC_START(__svm_vcpu_run)
@@ -49,16 +48,16 @@ SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BX
/* Save @svm. */
- push %_ASM_ARG2
-
- /* Save @vmcb. */
push %_ASM_ARG1
+.ifnc _ASM_ARG1, _ASM_DI
/* Move @svm to RDI. */
- mov %_ASM_ARG2, %_ASM_DI
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
- /* "POP" @vmcb to RAX. */
- pop %_ASM_AX
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
+ mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Load guest registers. */
mov VCPU_RCX(%_ASM_DI), %_ASM_CX
@@ -170,7 +169,7 @@ SYM_FUNC_END(__svm_vcpu_run)
/**
* __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
- * @vmcb_pa: unsigned long
+ * @svm: struct vcpu_svm *
*/
SYM_FUNC_START(__svm_sev_es_vcpu_run)
push %_ASM_BP
@@ -185,8 +184,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
push %_ASM_BX
- /* Move @vmcb to RAX. */
- mov %_ASM_ARG1, %_ASM_AX
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX
+ mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Enter guest mode */
sti
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
f6d58266d731 ("KVM: SVM: retrieve VMCB from assembly")
f7ef280132f9 ("KVM: SVM: adjust register allocation for __svm_vcpu_run()")
16fdc1de169e ("KVM: SVM: replace regs argument of __svm_vcpu_run() with vcpu_svm")
debc5a1ec0d1 ("KVM: x86: use a separate asm-offsets.c file")
bb06650634d3 ("KVM: VMX: Convert launched argument to flags")
8bd200d23ec4 ("KVM: VMX: Flatten __vmx_vcpu_run()")
527a534c7326 ("x86/tdx: Provide common base for SEAMCALL and TDCALL C wrappers")
59bd54a84d15 ("x86/tdx: Detect running as a TDX guest in early boot")
6198311093da ("x86/cc: Move arch/x86/{kernel/cc_platform.c => coco/core.c}")
f94909ceb1ed ("x86: Prepare asm files for straight-line-speculation")
22da5a07c75e ("x86/lib/atomic64_386_32: Rename things")
1367afaa2ee9 ("x86/entry: Use the correct fence macro after swapgs in kernel CR3")
c07e45553da1 ("x86/entry: Add a fence for kernel entry SWAPGS in paranoid_entry()")
6e5772c8d9cf ("Merge tag 'x86_cc_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f6d58266d731fd7e63163790aad21e0dbb1d5264 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Mon, 7 Nov 2022 04:17:29 -0500
Subject: [PATCH] KVM: SVM: retrieve VMCB from assembly
Continue moving accesses to struct vcpu_svm to vmenter.S. Reducing the
number of arguments limits the chance of mistakes due to different
registers used for argument passing in 32- and 64-bit ABIs; pushing the
VMCB argument and almost immediately popping it into a different
register looks pretty weird.
32-bit ABI is not a concern for __svm_sev_es_vcpu_run() which is 64-bit
only; however, it will soon need @svm to save/restore SPEC_CTRL so stay
consistent with __svm_vcpu_run() and let them share the same prototype.
No functional change intended.
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
index 30db96852e2d..f1b694e431ae 100644
--- a/arch/x86/kvm/kvm-asm-offsets.c
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -15,6 +15,8 @@ static void __used common(void)
if (IS_ENABLED(CONFIG_KVM_AMD)) {
BLANK();
OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
+ OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
}
if (IS_ENABLED(CONFIG_KVM_INTEL)) {
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b412bc5773c5..0c86c435c51f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3914,12 +3914,11 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- unsigned long vmcb_pa = svm->current_vmcb->pa;
guest_state_enter_irqoff();
if (sev_es_guest(vcpu->kvm)) {
- __svm_sev_es_vcpu_run(vmcb_pa);
+ __svm_sev_es_vcpu_run(svm);
} else {
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
@@ -3930,7 +3929,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
* vmcb02 when switching vmcbs for nested virtualization.
*/
vmload(svm->vmcb01.pa);
- __svm_vcpu_run(vmcb_pa, svm);
+ __svm_vcpu_run(svm);
vmsave(svm->vmcb01.pa);
vmload(__sme_page_pa(sd->save_area));
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 447e25c9101a..7ff1879e73c5 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -683,7 +683,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
-void __svm_sev_es_vcpu_run(unsigned long vmcb_pa);
-void __svm_vcpu_run(unsigned long vmcb_pa, struct vcpu_svm *svm);
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm);
+void __svm_vcpu_run(struct vcpu_svm *svm);
#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 531510ab6072..d07bac1952c5 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -32,7 +32,6 @@
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
- * @vmcb_pa: unsigned long
* @svm: struct vcpu_svm *
*/
SYM_FUNC_START(__svm_vcpu_run)
@@ -49,16 +48,16 @@ SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BX
/* Save @svm. */
- push %_ASM_ARG2
-
- /* Save @vmcb. */
push %_ASM_ARG1
+.ifnc _ASM_ARG1, _ASM_DI
/* Move @svm to RDI. */
- mov %_ASM_ARG2, %_ASM_DI
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
- /* "POP" @vmcb to RAX. */
- pop %_ASM_AX
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
+ mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Load guest registers. */
mov VCPU_RCX(%_ASM_DI), %_ASM_CX
@@ -170,7 +169,7 @@ SYM_FUNC_END(__svm_vcpu_run)
/**
* __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
- * @vmcb_pa: unsigned long
+ * @svm: struct vcpu_svm *
*/
SYM_FUNC_START(__svm_sev_es_vcpu_run)
push %_ASM_BP
@@ -185,8 +184,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
push %_ASM_BX
- /* Move @vmcb to RAX. */
- mov %_ASM_ARG1, %_ASM_AX
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX
+ mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Enter guest mode */
sti
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
f7ef280132f9 ("KVM: SVM: adjust register allocation for __svm_vcpu_run()")
16fdc1de169e ("KVM: SVM: replace regs argument of __svm_vcpu_run() with vcpu_svm")
debc5a1ec0d1 ("KVM: x86: use a separate asm-offsets.c file")
bb06650634d3 ("KVM: VMX: Convert launched argument to flags")
8bd200d23ec4 ("KVM: VMX: Flatten __vmx_vcpu_run()")
527a534c7326 ("x86/tdx: Provide common base for SEAMCALL and TDCALL C wrappers")
59bd54a84d15 ("x86/tdx: Detect running as a TDX guest in early boot")
6198311093da ("x86/cc: Move arch/x86/{kernel/cc_platform.c => coco/core.c}")
f94909ceb1ed ("x86: Prepare asm files for straight-line-speculation")
22da5a07c75e ("x86/lib/atomic64_386_32: Rename things")
1367afaa2ee9 ("x86/entry: Use the correct fence macro after swapgs in kernel CR3")
c07e45553da1 ("x86/entry: Add a fence for kernel entry SWAPGS in paranoid_entry()")
6e5772c8d9cf ("Merge tag 'x86_cc_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f7ef280132f9bf6f82acf5aa5c3c837206eef501 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Fri, 28 Oct 2022 17:30:07 -0400
Subject: [PATCH] KVM: SVM: adjust register allocation for __svm_vcpu_run()
32-bit ABI uses RAX/RCX/RDX as its argument registers, so they are in
the way of instructions that hardcode their operands such as RDMSR/WRMSR
or VMLOAD/VMRUN/VMSAVE.
In preparation for moving vmload/vmsave to __svm_vcpu_run(), keep
the pointer to the struct vcpu_svm in %rdi. In particular, it is now
possible to load svm->vmcb01.pa in %rax without clobbering the struct
vcpu_svm pointer.
No functional change intended.
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index f0ff41103e4c..531510ab6072 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -54,29 +54,29 @@ SYM_FUNC_START(__svm_vcpu_run)
/* Save @vmcb. */
push %_ASM_ARG1
- /* Move @svm to RAX. */
- mov %_ASM_ARG2, %_ASM_AX
+ /* Move @svm to RDI. */
+ mov %_ASM_ARG2, %_ASM_DI
+
+ /* "POP" @vmcb to RAX. */
+ pop %_ASM_AX
/* Load guest registers. */
- mov VCPU_RCX(%_ASM_AX), %_ASM_CX
- mov VCPU_RDX(%_ASM_AX), %_ASM_DX
- mov VCPU_RBX(%_ASM_AX), %_ASM_BX
- mov VCPU_RBP(%_ASM_AX), %_ASM_BP
- mov VCPU_RSI(%_ASM_AX), %_ASM_SI
- mov VCPU_RDI(%_ASM_AX), %_ASM_DI
+ mov VCPU_RCX(%_ASM_DI), %_ASM_CX
+ mov VCPU_RDX(%_ASM_DI), %_ASM_DX
+ mov VCPU_RBX(%_ASM_DI), %_ASM_BX
+ mov VCPU_RBP(%_ASM_DI), %_ASM_BP
+ mov VCPU_RSI(%_ASM_DI), %_ASM_SI
#ifdef CONFIG_X86_64
- mov VCPU_R8 (%_ASM_AX), %r8
- mov VCPU_R9 (%_ASM_AX), %r9
- mov VCPU_R10(%_ASM_AX), %r10
- mov VCPU_R11(%_ASM_AX), %r11
- mov VCPU_R12(%_ASM_AX), %r12
- mov VCPU_R13(%_ASM_AX), %r13
- mov VCPU_R14(%_ASM_AX), %r14
- mov VCPU_R15(%_ASM_AX), %r15
+ mov VCPU_R8 (%_ASM_DI), %r8
+ mov VCPU_R9 (%_ASM_DI), %r9
+ mov VCPU_R10(%_ASM_DI), %r10
+ mov VCPU_R11(%_ASM_DI), %r11
+ mov VCPU_R12(%_ASM_DI), %r12
+ mov VCPU_R13(%_ASM_DI), %r13
+ mov VCPU_R14(%_ASM_DI), %r14
+ mov VCPU_R15(%_ASM_DI), %r15
#endif
-
- /* "POP" @vmcb to RAX. */
- pop %_ASM_AX
+ mov VCPU_RDI(%_ASM_DI), %_ASM_DI
/* Enter guest mode */
sti
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
f7ef280132f9 ("KVM: SVM: adjust register allocation for __svm_vcpu_run()")
16fdc1de169e ("KVM: SVM: replace regs argument of __svm_vcpu_run() with vcpu_svm")
debc5a1ec0d1 ("KVM: x86: use a separate asm-offsets.c file")
bb06650634d3 ("KVM: VMX: Convert launched argument to flags")
8bd200d23ec4 ("KVM: VMX: Flatten __vmx_vcpu_run()")
527a534c7326 ("x86/tdx: Provide common base for SEAMCALL and TDCALL C wrappers")
59bd54a84d15 ("x86/tdx: Detect running as a TDX guest in early boot")
6198311093da ("x86/cc: Move arch/x86/{kernel/cc_platform.c => coco/core.c}")
f94909ceb1ed ("x86: Prepare asm files for straight-line-speculation")
22da5a07c75e ("x86/lib/atomic64_386_32: Rename things")
1367afaa2ee9 ("x86/entry: Use the correct fence macro after swapgs in kernel CR3")
c07e45553da1 ("x86/entry: Add a fence for kernel entry SWAPGS in paranoid_entry()")
6e5772c8d9cf ("Merge tag 'x86_cc_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f7ef280132f9bf6f82acf5aa5c3c837206eef501 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Fri, 28 Oct 2022 17:30:07 -0400
Subject: [PATCH] KVM: SVM: adjust register allocation for __svm_vcpu_run()
32-bit ABI uses RAX/RCX/RDX as its argument registers, so they are in
the way of instructions that hardcode their operands such as RDMSR/WRMSR
or VMLOAD/VMRUN/VMSAVE.
In preparation for moving vmload/vmsave to __svm_vcpu_run(), keep
the pointer to the struct vcpu_svm in %rdi. In particular, it is now
possible to load svm->vmcb01.pa in %rax without clobbering the struct
vcpu_svm pointer.
No functional change intended.
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index f0ff41103e4c..531510ab6072 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -54,29 +54,29 @@ SYM_FUNC_START(__svm_vcpu_run)
/* Save @vmcb. */
push %_ASM_ARG1
- /* Move @svm to RAX. */
- mov %_ASM_ARG2, %_ASM_AX
+ /* Move @svm to RDI. */
+ mov %_ASM_ARG2, %_ASM_DI
+
+ /* "POP" @vmcb to RAX. */
+ pop %_ASM_AX
/* Load guest registers. */
- mov VCPU_RCX(%_ASM_AX), %_ASM_CX
- mov VCPU_RDX(%_ASM_AX), %_ASM_DX
- mov VCPU_RBX(%_ASM_AX), %_ASM_BX
- mov VCPU_RBP(%_ASM_AX), %_ASM_BP
- mov VCPU_RSI(%_ASM_AX), %_ASM_SI
- mov VCPU_RDI(%_ASM_AX), %_ASM_DI
+ mov VCPU_RCX(%_ASM_DI), %_ASM_CX
+ mov VCPU_RDX(%_ASM_DI), %_ASM_DX
+ mov VCPU_RBX(%_ASM_DI), %_ASM_BX
+ mov VCPU_RBP(%_ASM_DI), %_ASM_BP
+ mov VCPU_RSI(%_ASM_DI), %_ASM_SI
#ifdef CONFIG_X86_64
- mov VCPU_R8 (%_ASM_AX), %r8
- mov VCPU_R9 (%_ASM_AX), %r9
- mov VCPU_R10(%_ASM_AX), %r10
- mov VCPU_R11(%_ASM_AX), %r11
- mov VCPU_R12(%_ASM_AX), %r12
- mov VCPU_R13(%_ASM_AX), %r13
- mov VCPU_R14(%_ASM_AX), %r14
- mov VCPU_R15(%_ASM_AX), %r15
+ mov VCPU_R8 (%_ASM_DI), %r8
+ mov VCPU_R9 (%_ASM_DI), %r9
+ mov VCPU_R10(%_ASM_DI), %r10
+ mov VCPU_R11(%_ASM_DI), %r11
+ mov VCPU_R12(%_ASM_DI), %r12
+ mov VCPU_R13(%_ASM_DI), %r13
+ mov VCPU_R14(%_ASM_DI), %r14
+ mov VCPU_R15(%_ASM_DI), %r15
#endif
-
- /* "POP" @vmcb to RAX. */
- pop %_ASM_AX
+ mov VCPU_RDI(%_ASM_DI), %_ASM_DI
/* Enter guest mode */
sti
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
9f2febf3f04d ("KVM: SVM: move MSR_IA32_SPEC_CTRL save/restore to assembly")
e287bd005ad9 ("KVM: SVM: restore host save area from assembly")
e61ab42de874 ("KVM: SVM: move guest vmsave/vmload back to assembly")
73412dfeea72 ("KVM: SVM: do not allocate struct svm_cpu_data dynamically")
181d0fb0bb02 ("KVM: SVM: remove dead field from struct svm_cpu_data")
f6d58266d731 ("KVM: SVM: retrieve VMCB from assembly")
f7ef280132f9 ("KVM: SVM: adjust register allocation for __svm_vcpu_run()")
16fdc1de169e ("KVM: SVM: replace regs argument of __svm_vcpu_run() with vcpu_svm")
debc5a1ec0d1 ("KVM: x86: use a separate asm-offsets.c file")
07853adc29a0 ("KVM: VMX: Prevent RSB underflow before vmenter")
fc02735b14ff ("KVM: VMX: Prevent guest RSB poisoning attacks with eIBRS")
bb06650634d3 ("KVM: VMX: Convert launched argument to flags")
8bd200d23ec4 ("KVM: VMX: Flatten __vmx_vcpu_run()")
acac5e98ef8d ("x86/speculation: Remove x86_spec_ctrl_mask")
bbb69e8bee1b ("x86/speculation: Use cached host SPEC_CTRL value for guest entry/exit")
7c693f54c873 ("x86/speculation: Add spectre_v2=ibrs option to support Kernel IBRS")
c779bc1a9002 ("x86/bugs: Optimize SPEC_CTRL MSR writes")
caa0ff24d5d0 ("x86/bugs: Keep a per-CPU IA32_SPEC_CTRL value")
a149180fbcf3 ("x86: Add magic AMD return-thunk")
d9e9d2300681 ("x86,objtool: Create .return_sites")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9f2febf3f04daebdaaa5a43cfa20e3844905c0f9 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Fri, 30 Sep 2022 14:24:40 -0400
Subject: [PATCH] KVM: SVM: move MSR_IA32_SPEC_CTRL save/restore to assembly
Restoration of the host IA32_SPEC_CTRL value is probably too late
with respect to the return thunk training sequence.
With respect to the user/kernel boundary, AMD says, "If software chooses
to toggle STIBP (e.g., set STIBP on kernel entry, and clear it on kernel
exit), software should set STIBP to 1 before executing the return thunk
training sequence." I assume the same requirements apply to the guest/host
boundary. The return thunk training sequence is in vmenter.S, quite close
to the VM-exit. On hosts without V_SPEC_CTRL, however, the host's
IA32_SPEC_CTRL value is not restored until much later.
To avoid this, move the restoration of host SPEC_CTRL to assembly and,
for consistency, move the restoration of the guest SPEC_CTRL as well.
This is not particularly difficult, apart from some care to cover both
32- and 64-bit, and to share code between SEV-ES and normal vmentry.
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Suggested-by: Jim Mattson <jmattson(a)google.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index da7c361f47e0..6ec0b7ce7453 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -196,22 +196,15 @@ void __init check_bugs(void)
}
/*
- * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is
- * done in vmenter.S.
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
*/
void
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
{
- u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
+ u64 guestval, hostval;
struct thread_info *ti = current_thread_info();
- if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
- if (hostval != guestval) {
- msrval = setguest ? guestval : hostval;
- wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
- }
- }
-
/*
* If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
* MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
index 1b805cd24d66..24a710d37323 100644
--- a/arch/x86/kvm/kvm-asm-offsets.c
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -16,6 +16,7 @@ static void __used common(void)
BLANK();
OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 469c1b5617af..cf1aed25f4ab 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -720,6 +720,15 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
u32 offset;
u32 *msrpm;
+ /*
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
to_svm(vcpu)->msrpm;
@@ -3901,16 +3910,16 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
struct vcpu_svm *svm = to_svm(vcpu);
guest_state_enter_irqoff();
if (sev_es_guest(vcpu->kvm))
- __svm_sev_es_vcpu_run(svm);
+ __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
else
- __svm_vcpu_run(svm);
+ __svm_vcpu_run(svm, spec_ctrl_intercepted);
guest_state_exit_irqoff();
}
@@ -3918,6 +3927,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
trace_kvm_entry(vcpu);
@@ -3976,26 +3986,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- svm_vcpu_enter_exit(vcpu);
-
- /*
- * We do not use IBRS in the kernel. If this vCPU has used the
- * SPEC_CTRL MSR it may have left it on; save the value and
- * turn it off. This is much more efficient than blindly adding
- * it to the atomic save/restore list. Especially as the former
- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
- *
- * For non-nested case:
- * If the L01 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- *
- * For nested case:
- * If the L02 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- */
- if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
- unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
- svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+ svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
if (!sev_es_guest(vcpu->kvm))
reload_tss(vcpu);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 83955a4e520e..199a2ecef1ce 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -682,7 +682,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
-void __svm_sev_es_vcpu_run(struct vcpu_svm *svm);
-void __svm_vcpu_run(struct vcpu_svm *svm);
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 57440acfc73e..34367dc203f2 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -32,9 +32,69 @@
.section .noinstr.text, "ax"
+.macro RESTORE_GUEST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+801:
+.endm
+.macro RESTORE_GUEST_SPEC_CTRL_BODY
+800:
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR. This is kept out-of-line so that the common
+ * case does not have to jump.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+ movl SVM_spec_ctrl(%_ASM_DI), %eax
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ je 801b
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ xor %edx, %edx
+ wrmsr
+ jmp 801b
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+901:
+.endm
+.macro RESTORE_HOST_SPEC_CTRL_BODY
+900:
+ /* Same for after vmexit. */
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+
+ /*
+ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+ * if it was not intercepted during guest execution.
+ */
+ cmpb $0, (%_ASM_SP)
+ jnz 998f
+ rdmsr
+ movl %eax, SVM_spec_ctrl(%_ASM_DI)
+998:
+
+ /* Now restore the host value of the MSR if different from the guest's. */
+ movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ cmp SVM_spec_ctrl(%_ASM_DI), %eax
+ je 901b
+ xor %edx, %edx
+ wrmsr
+ jmp 901b
+.endm
+
+
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BP
@@ -54,17 +114,26 @@ SYM_FUNC_START(__svm_vcpu_run)
* order compared to when they are needed.
*/
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
/* Needed to restore access to percpu variables. */
__ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa)
- /* Save @svm. */
+ /* Finally save @svm. */
push %_ASM_ARG1
.ifnc _ASM_ARG1, _ASM_DI
- /* Move @svm to RDI. */
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
mov %_ASM_ARG1, %_ASM_DI
.endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
/*
* Use a single vmcb (vmcb01 because it's always valid) for
* context switching guest state via VMLOAD/VMSAVE, that way
@@ -142,6 +211,9 @@ SYM_FUNC_START(__svm_vcpu_run)
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -177,6 +249,9 @@ SYM_FUNC_START(__svm_vcpu_run)
xor %r15d, %r15d
#endif
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -191,6 +266,9 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %_ASM_BP
RET
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
10: cmpb $0, kvm_rebooting
jne 2b
ud2
@@ -214,6 +292,7 @@ SYM_FUNC_END(__svm_vcpu_run)
/**
* __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_sev_es_vcpu_run)
push %_ASM_BP
@@ -228,8 +307,30 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
push %_ASM_BX
+ /*
+ * Save variables needed after vmexit on the stack, in inverse
+ * order compared to when they are needed.
+ */
+
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
/* Get svm->current_vmcb->pa into RAX. */
- mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Enter guest mode */
@@ -239,11 +340,17 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
2: cli
+ /* Pop @svm to RDI, guest registers have been saved already. */
+ pop %_ASM_DI
+
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -253,6 +360,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
*/
UNTRAIN_RET
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -267,6 +377,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
pop %_ASM_BP
RET
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
3: cmpb $0, kvm_rebooting
jne 2b
ud2
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
9f2febf3f04d ("KVM: SVM: move MSR_IA32_SPEC_CTRL save/restore to assembly")
e287bd005ad9 ("KVM: SVM: restore host save area from assembly")
e61ab42de874 ("KVM: SVM: move guest vmsave/vmload back to assembly")
73412dfeea72 ("KVM: SVM: do not allocate struct svm_cpu_data dynamically")
181d0fb0bb02 ("KVM: SVM: remove dead field from struct svm_cpu_data")
f6d58266d731 ("KVM: SVM: retrieve VMCB from assembly")
f7ef280132f9 ("KVM: SVM: adjust register allocation for __svm_vcpu_run()")
16fdc1de169e ("KVM: SVM: replace regs argument of __svm_vcpu_run() with vcpu_svm")
debc5a1ec0d1 ("KVM: x86: use a separate asm-offsets.c file")
07853adc29a0 ("KVM: VMX: Prevent RSB underflow before vmenter")
fc02735b14ff ("KVM: VMX: Prevent guest RSB poisoning attacks with eIBRS")
bb06650634d3 ("KVM: VMX: Convert launched argument to flags")
8bd200d23ec4 ("KVM: VMX: Flatten __vmx_vcpu_run()")
acac5e98ef8d ("x86/speculation: Remove x86_spec_ctrl_mask")
bbb69e8bee1b ("x86/speculation: Use cached host SPEC_CTRL value for guest entry/exit")
7c693f54c873 ("x86/speculation: Add spectre_v2=ibrs option to support Kernel IBRS")
c779bc1a9002 ("x86/bugs: Optimize SPEC_CTRL MSR writes")
caa0ff24d5d0 ("x86/bugs: Keep a per-CPU IA32_SPEC_CTRL value")
a149180fbcf3 ("x86: Add magic AMD return-thunk")
d9e9d2300681 ("x86,objtool: Create .return_sites")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9f2febf3f04daebdaaa5a43cfa20e3844905c0f9 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Fri, 30 Sep 2022 14:24:40 -0400
Subject: [PATCH] KVM: SVM: move MSR_IA32_SPEC_CTRL save/restore to assembly
Restoration of the host IA32_SPEC_CTRL value is probably too late
with respect to the return thunk training sequence.
With respect to the user/kernel boundary, AMD says, "If software chooses
to toggle STIBP (e.g., set STIBP on kernel entry, and clear it on kernel
exit), software should set STIBP to 1 before executing the return thunk
training sequence." I assume the same requirements apply to the guest/host
boundary. The return thunk training sequence is in vmenter.S, quite close
to the VM-exit. On hosts without V_SPEC_CTRL, however, the host's
IA32_SPEC_CTRL value is not restored until much later.
To avoid this, move the restoration of host SPEC_CTRL to assembly and,
for consistency, move the restoration of the guest SPEC_CTRL as well.
This is not particularly difficult, apart from some care to cover both
32- and 64-bit, and to share code between SEV-ES and normal vmentry.
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Suggested-by: Jim Mattson <jmattson(a)google.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index da7c361f47e0..6ec0b7ce7453 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -196,22 +196,15 @@ void __init check_bugs(void)
}
/*
- * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is
- * done in vmenter.S.
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
*/
void
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
{
- u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
+ u64 guestval, hostval;
struct thread_info *ti = current_thread_info();
- if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
- if (hostval != guestval) {
- msrval = setguest ? guestval : hostval;
- wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
- }
- }
-
/*
* If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
* MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
index 1b805cd24d66..24a710d37323 100644
--- a/arch/x86/kvm/kvm-asm-offsets.c
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -16,6 +16,7 @@ static void __used common(void)
BLANK();
OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 469c1b5617af..cf1aed25f4ab 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -720,6 +720,15 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
u32 offset;
u32 *msrpm;
+ /*
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
to_svm(vcpu)->msrpm;
@@ -3901,16 +3910,16 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
struct vcpu_svm *svm = to_svm(vcpu);
guest_state_enter_irqoff();
if (sev_es_guest(vcpu->kvm))
- __svm_sev_es_vcpu_run(svm);
+ __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
else
- __svm_vcpu_run(svm);
+ __svm_vcpu_run(svm, spec_ctrl_intercepted);
guest_state_exit_irqoff();
}
@@ -3918,6 +3927,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
trace_kvm_entry(vcpu);
@@ -3976,26 +3986,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- svm_vcpu_enter_exit(vcpu);
-
- /*
- * We do not use IBRS in the kernel. If this vCPU has used the
- * SPEC_CTRL MSR it may have left it on; save the value and
- * turn it off. This is much more efficient than blindly adding
- * it to the atomic save/restore list. Especially as the former
- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
- *
- * For non-nested case:
- * If the L01 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- *
- * For nested case:
- * If the L02 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- */
- if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
- unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
- svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+ svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
if (!sev_es_guest(vcpu->kvm))
reload_tss(vcpu);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 83955a4e520e..199a2ecef1ce 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -682,7 +682,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
-void __svm_sev_es_vcpu_run(struct vcpu_svm *svm);
-void __svm_vcpu_run(struct vcpu_svm *svm);
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 57440acfc73e..34367dc203f2 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -32,9 +32,69 @@
.section .noinstr.text, "ax"
+.macro RESTORE_GUEST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+801:
+.endm
+.macro RESTORE_GUEST_SPEC_CTRL_BODY
+800:
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR. This is kept out-of-line so that the common
+ * case does not have to jump.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+ movl SVM_spec_ctrl(%_ASM_DI), %eax
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ je 801b
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ xor %edx, %edx
+ wrmsr
+ jmp 801b
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+901:
+.endm
+.macro RESTORE_HOST_SPEC_CTRL_BODY
+900:
+ /* Same for after vmexit. */
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+
+ /*
+ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+ * if it was not intercepted during guest execution.
+ */
+ cmpb $0, (%_ASM_SP)
+ jnz 998f
+ rdmsr
+ movl %eax, SVM_spec_ctrl(%_ASM_DI)
+998:
+
+ /* Now restore the host value of the MSR if different from the guest's. */
+ movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ cmp SVM_spec_ctrl(%_ASM_DI), %eax
+ je 901b
+ xor %edx, %edx
+ wrmsr
+ jmp 901b
+.endm
+
+
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BP
@@ -54,17 +114,26 @@ SYM_FUNC_START(__svm_vcpu_run)
* order compared to when they are needed.
*/
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
/* Needed to restore access to percpu variables. */
__ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa)
- /* Save @svm. */
+ /* Finally save @svm. */
push %_ASM_ARG1
.ifnc _ASM_ARG1, _ASM_DI
- /* Move @svm to RDI. */
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
mov %_ASM_ARG1, %_ASM_DI
.endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
/*
* Use a single vmcb (vmcb01 because it's always valid) for
* context switching guest state via VMLOAD/VMSAVE, that way
@@ -142,6 +211,9 @@ SYM_FUNC_START(__svm_vcpu_run)
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -177,6 +249,9 @@ SYM_FUNC_START(__svm_vcpu_run)
xor %r15d, %r15d
#endif
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -191,6 +266,9 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %_ASM_BP
RET
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
10: cmpb $0, kvm_rebooting
jne 2b
ud2
@@ -214,6 +292,7 @@ SYM_FUNC_END(__svm_vcpu_run)
/**
* __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_sev_es_vcpu_run)
push %_ASM_BP
@@ -228,8 +307,30 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
push %_ASM_BX
+ /*
+ * Save variables needed after vmexit on the stack, in inverse
+ * order compared to when they are needed.
+ */
+
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
/* Get svm->current_vmcb->pa into RAX. */
- mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Enter guest mode */
@@ -239,11 +340,17 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
2: cli
+ /* Pop @svm to RDI, guest registers have been saved already. */
+ pop %_ASM_DI
+
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -253,6 +360,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
*/
UNTRAIN_RET
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -267,6 +377,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
pop %_ASM_BP
RET
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
3: cmpb $0, kvm_rebooting
jne 2b
ud2
The patch below does not apply to the 6.0-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
9f2febf3f04d ("KVM: SVM: move MSR_IA32_SPEC_CTRL save/restore to assembly")
e287bd005ad9 ("KVM: SVM: restore host save area from assembly")
e61ab42de874 ("KVM: SVM: move guest vmsave/vmload back to assembly")
73412dfeea72 ("KVM: SVM: do not allocate struct svm_cpu_data dynamically")
181d0fb0bb02 ("KVM: SVM: remove dead field from struct svm_cpu_data")
f6d58266d731 ("KVM: SVM: retrieve VMCB from assembly")
f7ef280132f9 ("KVM: SVM: adjust register allocation for __svm_vcpu_run()")
16fdc1de169e ("KVM: SVM: replace regs argument of __svm_vcpu_run() with vcpu_svm")
debc5a1ec0d1 ("KVM: x86: use a separate asm-offsets.c file")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9f2febf3f04daebdaaa5a43cfa20e3844905c0f9 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Fri, 30 Sep 2022 14:24:40 -0400
Subject: [PATCH] KVM: SVM: move MSR_IA32_SPEC_CTRL save/restore to assembly
Restoration of the host IA32_SPEC_CTRL value is probably too late
with respect to the return thunk training sequence.
With respect to the user/kernel boundary, AMD says, "If software chooses
to toggle STIBP (e.g., set STIBP on kernel entry, and clear it on kernel
exit), software should set STIBP to 1 before executing the return thunk
training sequence." I assume the same requirements apply to the guest/host
boundary. The return thunk training sequence is in vmenter.S, quite close
to the VM-exit. On hosts without V_SPEC_CTRL, however, the host's
IA32_SPEC_CTRL value is not restored until much later.
To avoid this, move the restoration of host SPEC_CTRL to assembly and,
for consistency, move the restoration of the guest SPEC_CTRL as well.
This is not particularly difficult, apart from some care to cover both
32- and 64-bit, and to share code between SEV-ES and normal vmentry.
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Suggested-by: Jim Mattson <jmattson(a)google.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index da7c361f47e0..6ec0b7ce7453 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -196,22 +196,15 @@ void __init check_bugs(void)
}
/*
- * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is
- * done in vmenter.S.
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
*/
void
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
{
- u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
+ u64 guestval, hostval;
struct thread_info *ti = current_thread_info();
- if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
- if (hostval != guestval) {
- msrval = setguest ? guestval : hostval;
- wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
- }
- }
-
/*
* If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
* MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
index 1b805cd24d66..24a710d37323 100644
--- a/arch/x86/kvm/kvm-asm-offsets.c
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -16,6 +16,7 @@ static void __used common(void)
BLANK();
OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 469c1b5617af..cf1aed25f4ab 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -720,6 +720,15 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
u32 offset;
u32 *msrpm;
+ /*
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
to_svm(vcpu)->msrpm;
@@ -3901,16 +3910,16 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
struct vcpu_svm *svm = to_svm(vcpu);
guest_state_enter_irqoff();
if (sev_es_guest(vcpu->kvm))
- __svm_sev_es_vcpu_run(svm);
+ __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
else
- __svm_vcpu_run(svm);
+ __svm_vcpu_run(svm, spec_ctrl_intercepted);
guest_state_exit_irqoff();
}
@@ -3918,6 +3927,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
trace_kvm_entry(vcpu);
@@ -3976,26 +3986,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- svm_vcpu_enter_exit(vcpu);
-
- /*
- * We do not use IBRS in the kernel. If this vCPU has used the
- * SPEC_CTRL MSR it may have left it on; save the value and
- * turn it off. This is much more efficient than blindly adding
- * it to the atomic save/restore list. Especially as the former
- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
- *
- * For non-nested case:
- * If the L01 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- *
- * For nested case:
- * If the L02 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- */
- if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
- unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
- svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+ svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
if (!sev_es_guest(vcpu->kvm))
reload_tss(vcpu);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 83955a4e520e..199a2ecef1ce 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -682,7 +682,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
-void __svm_sev_es_vcpu_run(struct vcpu_svm *svm);
-void __svm_vcpu_run(struct vcpu_svm *svm);
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 57440acfc73e..34367dc203f2 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -32,9 +32,69 @@
.section .noinstr.text, "ax"
+.macro RESTORE_GUEST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+801:
+.endm
+.macro RESTORE_GUEST_SPEC_CTRL_BODY
+800:
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR. This is kept out-of-line so that the common
+ * case does not have to jump.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+ movl SVM_spec_ctrl(%_ASM_DI), %eax
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ je 801b
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ xor %edx, %edx
+ wrmsr
+ jmp 801b
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+901:
+.endm
+.macro RESTORE_HOST_SPEC_CTRL_BODY
+900:
+ /* Same for after vmexit. */
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+
+ /*
+ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+ * if it was not intercepted during guest execution.
+ */
+ cmpb $0, (%_ASM_SP)
+ jnz 998f
+ rdmsr
+ movl %eax, SVM_spec_ctrl(%_ASM_DI)
+998:
+
+ /* Now restore the host value of the MSR if different from the guest's. */
+ movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ cmp SVM_spec_ctrl(%_ASM_DI), %eax
+ je 901b
+ xor %edx, %edx
+ wrmsr
+ jmp 901b
+.endm
+
+
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BP
@@ -54,17 +114,26 @@ SYM_FUNC_START(__svm_vcpu_run)
* order compared to when they are needed.
*/
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
/* Needed to restore access to percpu variables. */
__ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa)
- /* Save @svm. */
+ /* Finally save @svm. */
push %_ASM_ARG1
.ifnc _ASM_ARG1, _ASM_DI
- /* Move @svm to RDI. */
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
mov %_ASM_ARG1, %_ASM_DI
.endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
/*
* Use a single vmcb (vmcb01 because it's always valid) for
* context switching guest state via VMLOAD/VMSAVE, that way
@@ -142,6 +211,9 @@ SYM_FUNC_START(__svm_vcpu_run)
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -177,6 +249,9 @@ SYM_FUNC_START(__svm_vcpu_run)
xor %r15d, %r15d
#endif
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -191,6 +266,9 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %_ASM_BP
RET
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
10: cmpb $0, kvm_rebooting
jne 2b
ud2
@@ -214,6 +292,7 @@ SYM_FUNC_END(__svm_vcpu_run)
/**
* __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_sev_es_vcpu_run)
push %_ASM_BP
@@ -228,8 +307,30 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
push %_ASM_BX
+ /*
+ * Save variables needed after vmexit on the stack, in inverse
+ * order compared to when they are needed.
+ */
+
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
/* Get svm->current_vmcb->pa into RAX. */
- mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Enter guest mode */
@@ -239,11 +340,17 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
2: cli
+ /* Pop @svm to RDI, guest registers have been saved already. */
+ pop %_ASM_DI
+
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -253,6 +360,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
*/
UNTRAIN_RET
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -267,6 +377,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
pop %_ASM_BP
RET
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
3: cmpb $0, kvm_rebooting
jne 2b
ud2
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
debc5a1ec0d1 ("KVM: x86: use a separate asm-offsets.c file")
bb06650634d3 ("KVM: VMX: Convert launched argument to flags")
8bd200d23ec4 ("KVM: VMX: Flatten __vmx_vcpu_run()")
527a534c7326 ("x86/tdx: Provide common base for SEAMCALL and TDCALL C wrappers")
59bd54a84d15 ("x86/tdx: Detect running as a TDX guest in early boot")
6198311093da ("x86/cc: Move arch/x86/{kernel/cc_platform.c => coco/core.c}")
f94909ceb1ed ("x86: Prepare asm files for straight-line-speculation")
22da5a07c75e ("x86/lib/atomic64_386_32: Rename things")
1367afaa2ee9 ("x86/entry: Use the correct fence macro after swapgs in kernel CR3")
c07e45553da1 ("x86/entry: Add a fence for kernel entry SWAPGS in paranoid_entry()")
6e5772c8d9cf ("Merge tag 'x86_cc_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From debc5a1ec0d195ffea70d11efeffb713de9fdbc7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Tue, 8 Nov 2022 09:44:53 +0100
Subject: [PATCH] KVM: x86: use a separate asm-offsets.c file
This already removes an ugly #include "" from asm-offsets.c, but
especially it avoids a future error when trying to define asm-offsets
for KVM's svm/svm.h header.
This would not work for kernel/asm-offsets.c, because svm/svm.h
includes kvm_cache_regs.h which is not in the include path when
compiling asm-offsets.c. The problem is not there if the .c file is
in arch/x86/kvm.
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cb50589a7102..437308004ef2 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -19,7 +19,6 @@
#include <asm/suspend.h>
#include <asm/tlbflush.h>
#include <asm/tdx.h>
-#include "../kvm/vmx/vmx.h"
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -108,9 +107,4 @@ static void __used common(void)
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
-
- if (IS_ENABLED(CONFIG_KVM_INTEL)) {
- BLANK();
- OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl);
- }
}
diff --git a/arch/x86/kvm/.gitignore b/arch/x86/kvm/.gitignore
new file mode 100644
index 000000000000..615d6ff35c00
--- /dev/null
+++ b/arch/x86/kvm/.gitignore
@@ -0,0 +1,2 @@
+/kvm-asm-offsets.s
+/kvm-asm-offsets.h
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 30f244b64523..a02cf9baacc8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -34,3 +34,12 @@ endif
obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+
+AFLAGS_vmx/vmenter.o := -iquote $(obj)
+$(obj)/vmx/vmenter.o: $(obj)/kvm-asm-offsets.h
+
+$(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE
+ $(call filechk,offsets,__KVM_ASM_OFFSETS_H__)
+
+targets += kvm-asm-offsets.s
+clean-files += kvm-asm-offsets.h
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
new file mode 100644
index 000000000000..9d84f2b32d7f
--- /dev/null
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+#include "vmx/vmx.h"
+
+static void __used common(void)
+{
+ if (IS_ENABLED(CONFIG_KVM_INTEL)) {
+ BLANK();
+ OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl);
+ }
+}
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 8477d8bdd69c..0b5db4de4d09 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -1,12 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/asm.h>
-#include <asm/asm-offsets.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
#include <asm/percpu.h>
#include <asm/segment.h>
+#include "kvm-asm-offsets.h"
#include "run_flags.h"
#define WORD_SIZE (BITS_PER_LONG / 8)
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
debc5a1ec0d1 ("KVM: x86: use a separate asm-offsets.c file")
bb06650634d3 ("KVM: VMX: Convert launched argument to flags")
8bd200d23ec4 ("KVM: VMX: Flatten __vmx_vcpu_run()")
527a534c7326 ("x86/tdx: Provide common base for SEAMCALL and TDCALL C wrappers")
59bd54a84d15 ("x86/tdx: Detect running as a TDX guest in early boot")
6198311093da ("x86/cc: Move arch/x86/{kernel/cc_platform.c => coco/core.c}")
f94909ceb1ed ("x86: Prepare asm files for straight-line-speculation")
22da5a07c75e ("x86/lib/atomic64_386_32: Rename things")
1367afaa2ee9 ("x86/entry: Use the correct fence macro after swapgs in kernel CR3")
c07e45553da1 ("x86/entry: Add a fence for kernel entry SWAPGS in paranoid_entry()")
6e5772c8d9cf ("Merge tag 'x86_cc_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From debc5a1ec0d195ffea70d11efeffb713de9fdbc7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Tue, 8 Nov 2022 09:44:53 +0100
Subject: [PATCH] KVM: x86: use a separate asm-offsets.c file
This already removes an ugly #include "" from asm-offsets.c, but
especially it avoids a future error when trying to define asm-offsets
for KVM's svm/svm.h header.
This would not work for kernel/asm-offsets.c, because svm/svm.h
includes kvm_cache_regs.h which is not in the include path when
compiling asm-offsets.c. The problem is not there if the .c file is
in arch/x86/kvm.
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cb50589a7102..437308004ef2 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -19,7 +19,6 @@
#include <asm/suspend.h>
#include <asm/tlbflush.h>
#include <asm/tdx.h>
-#include "../kvm/vmx/vmx.h"
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -108,9 +107,4 @@ static void __used common(void)
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
-
- if (IS_ENABLED(CONFIG_KVM_INTEL)) {
- BLANK();
- OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl);
- }
}
diff --git a/arch/x86/kvm/.gitignore b/arch/x86/kvm/.gitignore
new file mode 100644
index 000000000000..615d6ff35c00
--- /dev/null
+++ b/arch/x86/kvm/.gitignore
@@ -0,0 +1,2 @@
+/kvm-asm-offsets.s
+/kvm-asm-offsets.h
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 30f244b64523..a02cf9baacc8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -34,3 +34,12 @@ endif
obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+
+AFLAGS_vmx/vmenter.o := -iquote $(obj)
+$(obj)/vmx/vmenter.o: $(obj)/kvm-asm-offsets.h
+
+$(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE
+ $(call filechk,offsets,__KVM_ASM_OFFSETS_H__)
+
+targets += kvm-asm-offsets.s
+clean-files += kvm-asm-offsets.h
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
new file mode 100644
index 000000000000..9d84f2b32d7f
--- /dev/null
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+#include "vmx/vmx.h"
+
+static void __used common(void)
+{
+ if (IS_ENABLED(CONFIG_KVM_INTEL)) {
+ BLANK();
+ OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl);
+ }
+}
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 8477d8bdd69c..0b5db4de4d09 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -1,12 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/asm.h>
-#include <asm/asm-offsets.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
#include <asm/percpu.h>
#include <asm/segment.h>
+#include "kvm-asm-offsets.h"
#include "run_flags.h"
#define WORD_SIZE (BITS_PER_LONG / 8)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
8631ef59b622 ("KVM: x86/pmu: Do not speculatively query Intel GP PMCs that don't exist yet")
902caeb6841a ("KVM: x86/pmu: Add PEBS_DATA_CFG MSR emulation to support adaptive PEBS")
8183a538cd95 ("KVM: x86/pmu: Add IA32_DS_AREA MSR emulation to support guest DS")
6ebe44366bde ("KVM: x86/pmu: Adjust precise_ip to emulate Ice Lake guest PDIR counter")
79f3e3b58386 ("KVM: x86/pmu: Reprogram PEBS event to emulate guest PEBS counter")
c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS")
0d23dc34a7ce ("x86/perf/core: Add pebs_capable to store valid PEBS_COUNTER_MASK value")
2c985527dd8d ("KVM: x86/pmu: Introduce the ctrl_mask value for fixed counter")
39a4d779546a ("perf/x86/core: Pass "struct kvm_pmu *" to determine the guest values")
38904911e864 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8631ef59b62290c7d88e7209e35dfb47f33f4902 Mon Sep 17 00:00:00 2001
From: Like Xu <likexu(a)tencent.com>
Date: Mon, 19 Sep 2022 17:10:06 +0800
Subject: [PATCH] KVM: x86/pmu: Do not speculatively query Intel GP PMCs that
don't exist yet
The SDM lists an architectural MSR IA32_CORE_CAPABILITIES (0xCF)
that limits the theoretical maximum value of the Intel GP PMC MSRs
allocated at 0xC1 to 14; likewise the Intel April 2022 SDM adds
IA32_OVERCLOCKING_STATUS at 0x195 which limits the number of event
selection MSRs to 15 (0x186-0x194).
Limiting the maximum number of counters to 14 or 18 based on the currently
allocated MSRs is clearly fragile, and it seems likely that Intel will
even place PMCs 8-15 at a completely different range of MSR indices.
So stop at the maximum number of GP PMCs supported today on Intel
processors.
There are some machines, like Intel P4 with non Architectural PMU, that
may indeed have 18 counters, but those counters are in a completely
different MSR address range and are not supported by KVM.
Cc: Vitaly Kuznetsov <vkuznets(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: cf05a67b68b8 ("KVM: x86: omit "impossible" pmu MSRs from MSR list")
Suggested-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Like Xu <likexu(a)tencent.com>
Reviewed-by: Jim Mattson <jmattson(a)google.com>
Message-Id: <20220919091008.60695-1-likexu(a)tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5f5eb577d583..73716fab120f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1442,20 +1442,10 @@ static const u32 msrs_to_save_all[] = {
MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
- MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
- MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
- MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
- MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
- MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
- MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
- MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
- MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
- MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
- MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
@@ -7041,12 +7031,12 @@ static void kvm_init_msr_list(void)
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
continue;
break;
- case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 7:
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
continue;
break;
- case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 7:
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
continue;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
8631ef59b622 ("KVM: x86/pmu: Do not speculatively query Intel GP PMCs that don't exist yet")
902caeb6841a ("KVM: x86/pmu: Add PEBS_DATA_CFG MSR emulation to support adaptive PEBS")
8183a538cd95 ("KVM: x86/pmu: Add IA32_DS_AREA MSR emulation to support guest DS")
6ebe44366bde ("KVM: x86/pmu: Adjust precise_ip to emulate Ice Lake guest PDIR counter")
79f3e3b58386 ("KVM: x86/pmu: Reprogram PEBS event to emulate guest PEBS counter")
c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS")
0d23dc34a7ce ("x86/perf/core: Add pebs_capable to store valid PEBS_COUNTER_MASK value")
2c985527dd8d ("KVM: x86/pmu: Introduce the ctrl_mask value for fixed counter")
39a4d779546a ("perf/x86/core: Pass "struct kvm_pmu *" to determine the guest values")
38904911e864 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8631ef59b62290c7d88e7209e35dfb47f33f4902 Mon Sep 17 00:00:00 2001
From: Like Xu <likexu(a)tencent.com>
Date: Mon, 19 Sep 2022 17:10:06 +0800
Subject: [PATCH] KVM: x86/pmu: Do not speculatively query Intel GP PMCs that
don't exist yet
The SDM lists an architectural MSR IA32_CORE_CAPABILITIES (0xCF)
that limits the theoretical maximum value of the Intel GP PMC MSRs
allocated at 0xC1 to 14; likewise the Intel April 2022 SDM adds
IA32_OVERCLOCKING_STATUS at 0x195 which limits the number of event
selection MSRs to 15 (0x186-0x194).
Limiting the maximum number of counters to 14 or 18 based on the currently
allocated MSRs is clearly fragile, and it seems likely that Intel will
even place PMCs 8-15 at a completely different range of MSR indices.
So stop at the maximum number of GP PMCs supported today on Intel
processors.
There are some machines, like Intel P4 with non Architectural PMU, that
may indeed have 18 counters, but those counters are in a completely
different MSR address range and are not supported by KVM.
Cc: Vitaly Kuznetsov <vkuznets(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: cf05a67b68b8 ("KVM: x86: omit "impossible" pmu MSRs from MSR list")
Suggested-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Like Xu <likexu(a)tencent.com>
Reviewed-by: Jim Mattson <jmattson(a)google.com>
Message-Id: <20220919091008.60695-1-likexu(a)tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5f5eb577d583..73716fab120f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1442,20 +1442,10 @@ static const u32 msrs_to_save_all[] = {
MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
- MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
- MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
- MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
- MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
- MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
- MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
- MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
- MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
- MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
- MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
@@ -7041,12 +7031,12 @@ static void kvm_init_msr_list(void)
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
continue;
break;
- case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 7:
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
continue;
break;
- case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 7:
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
continue;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
8631ef59b622 ("KVM: x86/pmu: Do not speculatively query Intel GP PMCs that don't exist yet")
902caeb6841a ("KVM: x86/pmu: Add PEBS_DATA_CFG MSR emulation to support adaptive PEBS")
8183a538cd95 ("KVM: x86/pmu: Add IA32_DS_AREA MSR emulation to support guest DS")
6ebe44366bde ("KVM: x86/pmu: Adjust precise_ip to emulate Ice Lake guest PDIR counter")
79f3e3b58386 ("KVM: x86/pmu: Reprogram PEBS event to emulate guest PEBS counter")
c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS")
0d23dc34a7ce ("x86/perf/core: Add pebs_capable to store valid PEBS_COUNTER_MASK value")
2c985527dd8d ("KVM: x86/pmu: Introduce the ctrl_mask value for fixed counter")
39a4d779546a ("perf/x86/core: Pass "struct kvm_pmu *" to determine the guest values")
38904911e864 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8631ef59b62290c7d88e7209e35dfb47f33f4902 Mon Sep 17 00:00:00 2001
From: Like Xu <likexu(a)tencent.com>
Date: Mon, 19 Sep 2022 17:10:06 +0800
Subject: [PATCH] KVM: x86/pmu: Do not speculatively query Intel GP PMCs that
don't exist yet
The SDM lists an architectural MSR IA32_CORE_CAPABILITIES (0xCF)
that limits the theoretical maximum value of the Intel GP PMC MSRs
allocated at 0xC1 to 14; likewise the Intel April 2022 SDM adds
IA32_OVERCLOCKING_STATUS at 0x195 which limits the number of event
selection MSRs to 15 (0x186-0x194).
Limiting the maximum number of counters to 14 or 18 based on the currently
allocated MSRs is clearly fragile, and it seems likely that Intel will
even place PMCs 8-15 at a completely different range of MSR indices.
So stop at the maximum number of GP PMCs supported today on Intel
processors.
There are some machines, like Intel P4 with non Architectural PMU, that
may indeed have 18 counters, but those counters are in a completely
different MSR address range and are not supported by KVM.
Cc: Vitaly Kuznetsov <vkuznets(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: cf05a67b68b8 ("KVM: x86: omit "impossible" pmu MSRs from MSR list")
Suggested-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Like Xu <likexu(a)tencent.com>
Reviewed-by: Jim Mattson <jmattson(a)google.com>
Message-Id: <20220919091008.60695-1-likexu(a)tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5f5eb577d583..73716fab120f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1442,20 +1442,10 @@ static const u32 msrs_to_save_all[] = {
MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
- MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
- MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
- MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
- MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
- MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
- MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
- MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
- MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
- MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
- MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
@@ -7041,12 +7031,12 @@ static void kvm_init_msr_list(void)
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
continue;
break;
- case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 7:
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
continue;
break;
- case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 7:
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
continue;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
6d3085e4d89a ("KVM: x86/mmu: Block all page faults during kvm_zap_gfn_range()")
20ec3ebd707c ("KVM: Rename mmu_notifier_* to mmu_invalidate_*")
65e3b446bcce ("KVM: x86/mmu: Document the "rules" for using host_pfn_mapping_level()")
a8ac499bb6ab ("KVM: x86/mmu: Don't require refcounted "struct page" to create huge SPTEs")
9202aee816c8 ("KVM: x86/mmu: Rename pte_list_{destroy,remove}() to show they zap SPTEs")
a42989e7fbb0 ("KVM: x86/mmu: Directly "destroy" PTE list when recycling rmaps")
2ff9039a75a8 ("KVM: x86/mmu: Decouple rmap_add() and link_shadow_page() from kvm_vcpu")
6ec6509eea39 ("KVM: x86/mmu: Pass const memslot to rmap_add()")
5d49f08c2e08 ("KVM: x86/mmu: Shove refcounted page dependency into host_pfn_mapping_level()")
b14b2690c50e ("KVM: Rename/refactor kvm_is_reserved_pfn() to kvm_pfn_to_refcounted_page()")
284dc4930773 ("KVM: Take a 'struct page', not a pfn in kvm_is_zone_device_page()")
b1624f99aa8f ("KVM: Remove kvm_vcpu_gfn_to_page() and kvm_vcpu_gpa_to_page()")
6573a6910ce4 ("KVM: Don't WARN if kvm_pfn_to_page() encounters a "reserved" pfn")
8e1c69149f27 ("KVM: Avoid pfn_to_page() and vice versa when releasing pages")
a1040b0d42ac ("KVM: Don't set Accessed/Dirty bits for ZERO_PAGE")
b31455e96f00 ("Merge branch 'kvm-5.20-early-patches' into HEAD")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6d3085e4d89ad7e6c7f1c6cf929d903393565861 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 11 Nov 2022 00:18:41 +0000
Subject: [PATCH] KVM: x86/mmu: Block all page faults during
kvm_zap_gfn_range()
When zapping a GFN range, pass 0 => ALL_ONES for the to-be-invalidated
range to effectively block all page faults while the zap is in-progress.
The invalidation helpers take a host virtual address, whereas zapping a
GFN obviously provides a guest physical address and with the wrong unit
of measurement (frame vs. byte).
Alternatively, KVM could walk all memslots to get the associated HVAs,
but thanks to SMM, that would require multiple lookups. And practically
speaking, kvm_zap_gfn_range() usage is quite rare and not a hot path,
e.g. MTRR and CR0.CD are almost guaranteed to be done only on vCPU0
during boot, and APICv inhibits are similarly infrequent operations.
Fixes: edb298c663fc ("KVM: x86/mmu: bump mmu notifier count in kvm_zap_gfn_range")
Reported-by: Chao Peng <chao.p.peng(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
Cc: Maxim Levitsky <mlevitsk(a)redhat.com>
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221111001841.2412598-1-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6f81539061d6..1ccb769f62af 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6056,7 +6056,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
write_lock(&kvm->mmu_lock);
- kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end);
+ kvm_mmu_invalidate_begin(kvm, 0, -1ul);
flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
@@ -6070,7 +6070,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
gfn_end - gfn_start);
- kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end);
+ kvm_mmu_invalidate_end(kvm, 0, -1ul);
write_unlock(&kvm->mmu_lock);
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
0ec8ce073944 ("dmaengine: idxd: Do not enable user type Work Queue without Shared Virtual Addressing")
403a2e236538 ("dmaengine: idxd: change MSIX allocation based on per wq activation")
23a50c803565 ("dmaengine: idxd: fix descriptor flushing locking")
ec0d64231615 ("dmaengine: idxd: embed irq_entry in idxd_wq struct")
56fc39f5a367 ("dmaengine: idxd: handle interrupt handle revoked event")
f6d442f7088c ("dmaengine: idxd: handle invalid interrupt handle descriptors")
bd5970a0d01f ("dmaengine: idxd: create locked version of idxd_quiesce() call")
46c6df1c958e ("dmaengine: idxd: add helper for per interrupt handle drain")
eb0cf33a91b4 ("dmaengine: idxd: move interrupt handle assignment")
8b67426e0558 ("dmaengine: idxd: int handle management refactoring")
5d78abb6fbc9 ("dmaengine: idxd: rework descriptor free path on failure")
a3e340c1574b ("dmaengine: idxd: fix resource leak on dmaengine driver disable")
e530a9f3db41 ("dmaengine: idxd: reconfig device after device reset command")
88d97ea82cbe ("dmaengine: idxd: add halt interrupt support")
85f604af9c83 ("dmaengine: idxd: move out percpu_ref_exit() to ensure it's outside submission")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0ec8ce07394442d722806fe61b901a5b2b17249d Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu(a)intel.com>
Date: Fri, 14 Oct 2022 15:25:41 -0700
Subject: [PATCH] dmaengine: idxd: Do not enable user type Work Queue without
Shared Virtual Addressing
When the idxd_user_drv driver is bound to a Work Queue (WQ) device
without IOMMU or with IOMMU Passthrough without Shared Virtual
Addressing (SVA), the application gains direct access to physical
memory via the device by programming physical address to a submitted
descriptor. This allows direct userspace read and write access to
arbitrary physical memory. This is inconsistent with the security
goals of a good kernel API.
Unlike vfio_pci driver, the IDXD char device driver does not provide any
ways to pin user pages and translate the address from user VA to IOVA or
PA without IOMMU SVA. Therefore the application has no way to instruct the
device to perform DMA function. This makes the char device not usable for
normal application usage.
Since user type WQ without SVA cannot be used for normal application usage
and presents the security issue, bind idxd_user_drv driver and enable user
type WQ only when SVA is enabled (i.e. user PASID is enabled).
Fixes: 448c3de8ac83 ("dmaengine: idxd: create user driver for wq 'device'")
Cc: stable(a)vger.kernel.org
Suggested-by: Arjan Van De Ven <arjan.van.de.ven(a)intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu(a)intel.com>
Reviewed-by: Dave Jiang <dave.jiang(a)intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel(a)redhat.com>
Link: https://lore.kernel.org/r/20221014222541.3912195-1-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index c2808fd081d6..a9b96b18772f 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -312,6 +312,24 @@ static int idxd_user_drv_probe(struct idxd_dev *idxd_dev)
if (idxd->state != IDXD_DEV_ENABLED)
return -ENXIO;
+ /*
+ * User type WQ is enabled only when SVA is enabled for two reasons:
+ * - If no IOMMU or IOMMU Passthrough without SVA, userspace
+ * can directly access physical address through the WQ.
+ * - The IDXD cdev driver does not provide any ways to pin
+ * user pages and translate the address from user VA to IOVA or
+ * PA without IOMMU SVA. Therefore the application has no way
+ * to instruct the device to perform DMA function. This makes
+ * the cdev not usable for normal application usage.
+ */
+ if (!device_user_pasid_enabled(idxd)) {
+ idxd->cmd_status = IDXD_SCMD_WQ_USER_NO_IOMMU;
+ dev_dbg(&idxd->pdev->dev,
+ "User type WQ cannot be enabled without SVA.\n");
+
+ return -EOPNOTSUPP;
+ }
+
mutex_lock(&wq->wq_lock);
wq->type = IDXD_WQT_USER;
rc = drv_enable_wq(wq);
diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index 095299c75828..2b9e7feba3f3 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -29,6 +29,7 @@ enum idxd_scmd_stat {
IDXD_SCMD_WQ_NO_SIZE = 0x800e0000,
IDXD_SCMD_WQ_NO_PRIV = 0x800f0000,
IDXD_SCMD_WQ_IRQ_ERR = 0x80100000,
+ IDXD_SCMD_WQ_USER_NO_IOMMU = 0x80110000,
};
#define IDXD_SCMD_SOFTERR_MASK 0x80000000
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
8625147cafaa ("hugetlbfs: don't delete error page from pagecache")
7e1813d48dd3 ("hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache")
1508062ecd55 ("hugetlbfs: Convert remove_inode_hugepages() to use filemap_get_folios()")
d9ef44de5d73 ("hugetlb: Convert huge_add_to_page_cache() to use a folio")
dd0f230a0a80 ("mm: hwpoison: refactor refcount check handling")
ea6d0630100b ("mm/hwpoison: do not lock page again when me_huge_page() successfully recovers")
171936ddaf97 ("mm/memory-failure: use a mutex to avoid memory_failure() races")
e32905e57358 ("userfaultfd: hugetlbfs: fix new flag usage in error path")
15b836536321 ("mm/hugetlb: remove unused variable pseudo_vma in remove_inode_hugepages()")
d4241a049ac0 ("mm/hugetlb: avoid calculating fault_mutex_hash in truncate_op case")
d6995da31122 ("hugetlb: use page.private for hugetlb specific page flags")
585fc0d2871c ("mm: hugetlbfs: fix cannot migrate the fallocated HugeTLB page")
a8b2c2ce89d4 ("mm,hwpoison: take free pages off the buddy freelists")
5d1fd5dc877b ("mm,hwpoison: introduce MF_MSG_UNSPLIT_THP")
694bf0b0cdf9 ("mm,hwpoison: unify THP handling for hard and soft offline")
dd6e2402fad9 ("mm,hwpoison: kill put_hwpoison_page")
7e27f22c9e40 ("mm,hwpoison: unexport get_hwpoison_page and make it static")
bbe88753bd42 ("mm/hugetlb: make hugetlb migration callback CMA aware")
41b4dc14ee80 ("mm/gup: restrict CMA region by using allocation scope API")
19fc7bed252c ("mm/migrate: introduce a standard migration target allocation function")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8625147cafaa9ba74713d682f5185eb62cb2aedb Mon Sep 17 00:00:00 2001
From: James Houghton <jthoughton(a)google.com>
Date: Tue, 18 Oct 2022 20:01:25 +0000
Subject: [PATCH] hugetlbfs: don't delete error page from pagecache
This change is very similar to the change that was made for shmem [1], and
it solves the same problem but for HugeTLBFS instead.
Currently, when poison is found in a HugeTLB page, the page is removed
from the page cache. That means that attempting to map or read that
hugepage in the future will result in a new hugepage being allocated
instead of notifying the user that the page was poisoned. As [1] states,
this is effectively memory corruption.
The fix is to leave the page in the page cache. If the user attempts to
use a poisoned HugeTLB page with a syscall, the syscall will fail with
EIO, the same error code that shmem uses. For attempts to map the page,
the thread will get a BUS_MCEERR_AR SIGBUS.
[1]: commit a76054266661 ("mm: shmem: don't truncate page if memory failure happens")
Link: https://lkml.kernel.org/r/20221018200125.848471-1-jthoughton@google.com
Signed-off-by: James Houghton <jthoughton(a)google.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Tested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: James Houghton <jthoughton(a)google.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dd54f67e47fd..df7772335dc0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
+ if (PageHWPoison(page)) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
+
/*
* We have the page, copy it to user space buffer.
*/
@@ -1111,13 +1117,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
{
- struct inode *inode = mapping->host;
- pgoff_t index = page->index;
-
- hugetlb_delete_from_page_cache(page);
- if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
-
return 0;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546df97c31e4..e48f8ef45b17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6111,6 +6111,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ptl = huge_pte_lock(h, dst_mm, dst_pte);
+ ret = -EIO;
+ if (PageHWPoison(page))
+ goto out_release_unlock;
+
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 145bb561ddb3..bead6bccc7f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1080,6 +1080,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
@@ -1087,6 +1088,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
unlock_page(hpage);
} else {
unlock_page(hpage);
@@ -1104,7 +1107,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
- if (has_extra_refcount(ps, p, false))
+ if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
return res;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
8625147cafaa ("hugetlbfs: don't delete error page from pagecache")
7e1813d48dd3 ("hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache")
1508062ecd55 ("hugetlbfs: Convert remove_inode_hugepages() to use filemap_get_folios()")
d9ef44de5d73 ("hugetlb: Convert huge_add_to_page_cache() to use a folio")
dd0f230a0a80 ("mm: hwpoison: refactor refcount check handling")
ea6d0630100b ("mm/hwpoison: do not lock page again when me_huge_page() successfully recovers")
171936ddaf97 ("mm/memory-failure: use a mutex to avoid memory_failure() races")
e32905e57358 ("userfaultfd: hugetlbfs: fix new flag usage in error path")
15b836536321 ("mm/hugetlb: remove unused variable pseudo_vma in remove_inode_hugepages()")
d4241a049ac0 ("mm/hugetlb: avoid calculating fault_mutex_hash in truncate_op case")
d6995da31122 ("hugetlb: use page.private for hugetlb specific page flags")
585fc0d2871c ("mm: hugetlbfs: fix cannot migrate the fallocated HugeTLB page")
a8b2c2ce89d4 ("mm,hwpoison: take free pages off the buddy freelists")
5d1fd5dc877b ("mm,hwpoison: introduce MF_MSG_UNSPLIT_THP")
694bf0b0cdf9 ("mm,hwpoison: unify THP handling for hard and soft offline")
dd6e2402fad9 ("mm,hwpoison: kill put_hwpoison_page")
7e27f22c9e40 ("mm,hwpoison: unexport get_hwpoison_page and make it static")
bbe88753bd42 ("mm/hugetlb: make hugetlb migration callback CMA aware")
41b4dc14ee80 ("mm/gup: restrict CMA region by using allocation scope API")
19fc7bed252c ("mm/migrate: introduce a standard migration target allocation function")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8625147cafaa9ba74713d682f5185eb62cb2aedb Mon Sep 17 00:00:00 2001
From: James Houghton <jthoughton(a)google.com>
Date: Tue, 18 Oct 2022 20:01:25 +0000
Subject: [PATCH] hugetlbfs: don't delete error page from pagecache
This change is very similar to the change that was made for shmem [1], and
it solves the same problem but for HugeTLBFS instead.
Currently, when poison is found in a HugeTLB page, the page is removed
from the page cache. That means that attempting to map or read that
hugepage in the future will result in a new hugepage being allocated
instead of notifying the user that the page was poisoned. As [1] states,
this is effectively memory corruption.
The fix is to leave the page in the page cache. If the user attempts to
use a poisoned HugeTLB page with a syscall, the syscall will fail with
EIO, the same error code that shmem uses. For attempts to map the page,
the thread will get a BUS_MCEERR_AR SIGBUS.
[1]: commit a76054266661 ("mm: shmem: don't truncate page if memory failure happens")
Link: https://lkml.kernel.org/r/20221018200125.848471-1-jthoughton@google.com
Signed-off-by: James Houghton <jthoughton(a)google.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Tested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: James Houghton <jthoughton(a)google.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dd54f67e47fd..df7772335dc0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
+ if (PageHWPoison(page)) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
+
/*
* We have the page, copy it to user space buffer.
*/
@@ -1111,13 +1117,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
{
- struct inode *inode = mapping->host;
- pgoff_t index = page->index;
-
- hugetlb_delete_from_page_cache(page);
- if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
-
return 0;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546df97c31e4..e48f8ef45b17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6111,6 +6111,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ptl = huge_pte_lock(h, dst_mm, dst_pte);
+ ret = -EIO;
+ if (PageHWPoison(page))
+ goto out_release_unlock;
+
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 145bb561ddb3..bead6bccc7f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1080,6 +1080,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
@@ -1087,6 +1088,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
unlock_page(hpage);
} else {
unlock_page(hpage);
@@ -1104,7 +1107,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
- if (has_extra_refcount(ps, p, false))
+ if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
return res;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
8625147cafaa ("hugetlbfs: don't delete error page from pagecache")
7e1813d48dd3 ("hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache")
1508062ecd55 ("hugetlbfs: Convert remove_inode_hugepages() to use filemap_get_folios()")
d9ef44de5d73 ("hugetlb: Convert huge_add_to_page_cache() to use a folio")
dd0f230a0a80 ("mm: hwpoison: refactor refcount check handling")
ea6d0630100b ("mm/hwpoison: do not lock page again when me_huge_page() successfully recovers")
171936ddaf97 ("mm/memory-failure: use a mutex to avoid memory_failure() races")
e32905e57358 ("userfaultfd: hugetlbfs: fix new flag usage in error path")
15b836536321 ("mm/hugetlb: remove unused variable pseudo_vma in remove_inode_hugepages()")
d4241a049ac0 ("mm/hugetlb: avoid calculating fault_mutex_hash in truncate_op case")
d6995da31122 ("hugetlb: use page.private for hugetlb specific page flags")
585fc0d2871c ("mm: hugetlbfs: fix cannot migrate the fallocated HugeTLB page")
a8b2c2ce89d4 ("mm,hwpoison: take free pages off the buddy freelists")
5d1fd5dc877b ("mm,hwpoison: introduce MF_MSG_UNSPLIT_THP")
694bf0b0cdf9 ("mm,hwpoison: unify THP handling for hard and soft offline")
dd6e2402fad9 ("mm,hwpoison: kill put_hwpoison_page")
7e27f22c9e40 ("mm,hwpoison: unexport get_hwpoison_page and make it static")
bbe88753bd42 ("mm/hugetlb: make hugetlb migration callback CMA aware")
41b4dc14ee80 ("mm/gup: restrict CMA region by using allocation scope API")
19fc7bed252c ("mm/migrate: introduce a standard migration target allocation function")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8625147cafaa9ba74713d682f5185eb62cb2aedb Mon Sep 17 00:00:00 2001
From: James Houghton <jthoughton(a)google.com>
Date: Tue, 18 Oct 2022 20:01:25 +0000
Subject: [PATCH] hugetlbfs: don't delete error page from pagecache
This change is very similar to the change that was made for shmem [1], and
it solves the same problem but for HugeTLBFS instead.
Currently, when poison is found in a HugeTLB page, the page is removed
from the page cache. That means that attempting to map or read that
hugepage in the future will result in a new hugepage being allocated
instead of notifying the user that the page was poisoned. As [1] states,
this is effectively memory corruption.
The fix is to leave the page in the page cache. If the user attempts to
use a poisoned HugeTLB page with a syscall, the syscall will fail with
EIO, the same error code that shmem uses. For attempts to map the page,
the thread will get a BUS_MCEERR_AR SIGBUS.
[1]: commit a76054266661 ("mm: shmem: don't truncate page if memory failure happens")
Link: https://lkml.kernel.org/r/20221018200125.848471-1-jthoughton@google.com
Signed-off-by: James Houghton <jthoughton(a)google.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Tested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: James Houghton <jthoughton(a)google.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dd54f67e47fd..df7772335dc0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
+ if (PageHWPoison(page)) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
+
/*
* We have the page, copy it to user space buffer.
*/
@@ -1111,13 +1117,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
{
- struct inode *inode = mapping->host;
- pgoff_t index = page->index;
-
- hugetlb_delete_from_page_cache(page);
- if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
-
return 0;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546df97c31e4..e48f8ef45b17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6111,6 +6111,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ptl = huge_pte_lock(h, dst_mm, dst_pte);
+ ret = -EIO;
+ if (PageHWPoison(page))
+ goto out_release_unlock;
+
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 145bb561ddb3..bead6bccc7f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1080,6 +1080,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
@@ -1087,6 +1088,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
unlock_page(hpage);
} else {
unlock_page(hpage);
@@ -1104,7 +1107,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
- if (has_extra_refcount(ps, p, false))
+ if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
return res;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
8625147cafaa ("hugetlbfs: don't delete error page from pagecache")
7e1813d48dd3 ("hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache")
1508062ecd55 ("hugetlbfs: Convert remove_inode_hugepages() to use filemap_get_folios()")
d9ef44de5d73 ("hugetlb: Convert huge_add_to_page_cache() to use a folio")
dd0f230a0a80 ("mm: hwpoison: refactor refcount check handling")
ea6d0630100b ("mm/hwpoison: do not lock page again when me_huge_page() successfully recovers")
171936ddaf97 ("mm/memory-failure: use a mutex to avoid memory_failure() races")
e32905e57358 ("userfaultfd: hugetlbfs: fix new flag usage in error path")
15b836536321 ("mm/hugetlb: remove unused variable pseudo_vma in remove_inode_hugepages()")
d4241a049ac0 ("mm/hugetlb: avoid calculating fault_mutex_hash in truncate_op case")
d6995da31122 ("hugetlb: use page.private for hugetlb specific page flags")
585fc0d2871c ("mm: hugetlbfs: fix cannot migrate the fallocated HugeTLB page")
a8b2c2ce89d4 ("mm,hwpoison: take free pages off the buddy freelists")
5d1fd5dc877b ("mm,hwpoison: introduce MF_MSG_UNSPLIT_THP")
694bf0b0cdf9 ("mm,hwpoison: unify THP handling for hard and soft offline")
dd6e2402fad9 ("mm,hwpoison: kill put_hwpoison_page")
7e27f22c9e40 ("mm,hwpoison: unexport get_hwpoison_page and make it static")
bbe88753bd42 ("mm/hugetlb: make hugetlb migration callback CMA aware")
41b4dc14ee80 ("mm/gup: restrict CMA region by using allocation scope API")
19fc7bed252c ("mm/migrate: introduce a standard migration target allocation function")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8625147cafaa9ba74713d682f5185eb62cb2aedb Mon Sep 17 00:00:00 2001
From: James Houghton <jthoughton(a)google.com>
Date: Tue, 18 Oct 2022 20:01:25 +0000
Subject: [PATCH] hugetlbfs: don't delete error page from pagecache
This change is very similar to the change that was made for shmem [1], and
it solves the same problem but for HugeTLBFS instead.
Currently, when poison is found in a HugeTLB page, the page is removed
from the page cache. That means that attempting to map or read that
hugepage in the future will result in a new hugepage being allocated
instead of notifying the user that the page was poisoned. As [1] states,
this is effectively memory corruption.
The fix is to leave the page in the page cache. If the user attempts to
use a poisoned HugeTLB page with a syscall, the syscall will fail with
EIO, the same error code that shmem uses. For attempts to map the page,
the thread will get a BUS_MCEERR_AR SIGBUS.
[1]: commit a76054266661 ("mm: shmem: don't truncate page if memory failure happens")
Link: https://lkml.kernel.org/r/20221018200125.848471-1-jthoughton@google.com
Signed-off-by: James Houghton <jthoughton(a)google.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Tested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: James Houghton <jthoughton(a)google.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dd54f67e47fd..df7772335dc0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
+ if (PageHWPoison(page)) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
+
/*
* We have the page, copy it to user space buffer.
*/
@@ -1111,13 +1117,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
{
- struct inode *inode = mapping->host;
- pgoff_t index = page->index;
-
- hugetlb_delete_from_page_cache(page);
- if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
-
return 0;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546df97c31e4..e48f8ef45b17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6111,6 +6111,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ptl = huge_pte_lock(h, dst_mm, dst_pte);
+ ret = -EIO;
+ if (PageHWPoison(page))
+ goto out_release_unlock;
+
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 145bb561ddb3..bead6bccc7f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1080,6 +1080,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
@@ -1087,6 +1088,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
unlock_page(hpage);
} else {
unlock_page(hpage);
@@ -1104,7 +1107,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
- if (has_extra_refcount(ps, p, false))
+ if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
return res;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
8625147cafaa ("hugetlbfs: don't delete error page from pagecache")
7e1813d48dd3 ("hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache")
1508062ecd55 ("hugetlbfs: Convert remove_inode_hugepages() to use filemap_get_folios()")
d9ef44de5d73 ("hugetlb: Convert huge_add_to_page_cache() to use a folio")
dd0f230a0a80 ("mm: hwpoison: refactor refcount check handling")
ea6d0630100b ("mm/hwpoison: do not lock page again when me_huge_page() successfully recovers")
171936ddaf97 ("mm/memory-failure: use a mutex to avoid memory_failure() races")
e32905e57358 ("userfaultfd: hugetlbfs: fix new flag usage in error path")
15b836536321 ("mm/hugetlb: remove unused variable pseudo_vma in remove_inode_hugepages()")
d4241a049ac0 ("mm/hugetlb: avoid calculating fault_mutex_hash in truncate_op case")
d6995da31122 ("hugetlb: use page.private for hugetlb specific page flags")
585fc0d2871c ("mm: hugetlbfs: fix cannot migrate the fallocated HugeTLB page")
a8b2c2ce89d4 ("mm,hwpoison: take free pages off the buddy freelists")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8625147cafaa9ba74713d682f5185eb62cb2aedb Mon Sep 17 00:00:00 2001
From: James Houghton <jthoughton(a)google.com>
Date: Tue, 18 Oct 2022 20:01:25 +0000
Subject: [PATCH] hugetlbfs: don't delete error page from pagecache
This change is very similar to the change that was made for shmem [1], and
it solves the same problem but for HugeTLBFS instead.
Currently, when poison is found in a HugeTLB page, the page is removed
from the page cache. That means that attempting to map or read that
hugepage in the future will result in a new hugepage being allocated
instead of notifying the user that the page was poisoned. As [1] states,
this is effectively memory corruption.
The fix is to leave the page in the page cache. If the user attempts to
use a poisoned HugeTLB page with a syscall, the syscall will fail with
EIO, the same error code that shmem uses. For attempts to map the page,
the thread will get a BUS_MCEERR_AR SIGBUS.
[1]: commit a76054266661 ("mm: shmem: don't truncate page if memory failure happens")
Link: https://lkml.kernel.org/r/20221018200125.848471-1-jthoughton@google.com
Signed-off-by: James Houghton <jthoughton(a)google.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Tested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: James Houghton <jthoughton(a)google.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dd54f67e47fd..df7772335dc0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
+ if (PageHWPoison(page)) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
+
/*
* We have the page, copy it to user space buffer.
*/
@@ -1111,13 +1117,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
{
- struct inode *inode = mapping->host;
- pgoff_t index = page->index;
-
- hugetlb_delete_from_page_cache(page);
- if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
-
return 0;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546df97c31e4..e48f8ef45b17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6111,6 +6111,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ptl = huge_pte_lock(h, dst_mm, dst_pte);
+ ret = -EIO;
+ if (PageHWPoison(page))
+ goto out_release_unlock;
+
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 145bb561ddb3..bead6bccc7f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1080,6 +1080,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
@@ -1087,6 +1088,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
unlock_page(hpage);
} else {
unlock_page(hpage);
@@ -1104,7 +1107,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
- if (has_extra_refcount(ps, p, false))
+ if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
return res;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
8625147cafaa ("hugetlbfs: don't delete error page from pagecache")
7e1813d48dd3 ("hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache")
1508062ecd55 ("hugetlbfs: Convert remove_inode_hugepages() to use filemap_get_folios()")
d9ef44de5d73 ("hugetlb: Convert huge_add_to_page_cache() to use a folio")
dd0f230a0a80 ("mm: hwpoison: refactor refcount check handling")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8625147cafaa9ba74713d682f5185eb62cb2aedb Mon Sep 17 00:00:00 2001
From: James Houghton <jthoughton(a)google.com>
Date: Tue, 18 Oct 2022 20:01:25 +0000
Subject: [PATCH] hugetlbfs: don't delete error page from pagecache
This change is very similar to the change that was made for shmem [1], and
it solves the same problem but for HugeTLBFS instead.
Currently, when poison is found in a HugeTLB page, the page is removed
from the page cache. That means that attempting to map or read that
hugepage in the future will result in a new hugepage being allocated
instead of notifying the user that the page was poisoned. As [1] states,
this is effectively memory corruption.
The fix is to leave the page in the page cache. If the user attempts to
use a poisoned HugeTLB page with a syscall, the syscall will fail with
EIO, the same error code that shmem uses. For attempts to map the page,
the thread will get a BUS_MCEERR_AR SIGBUS.
[1]: commit a76054266661 ("mm: shmem: don't truncate page if memory failure happens")
Link: https://lkml.kernel.org/r/20221018200125.848471-1-jthoughton@google.com
Signed-off-by: James Houghton <jthoughton(a)google.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Tested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: James Houghton <jthoughton(a)google.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dd54f67e47fd..df7772335dc0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
+ if (PageHWPoison(page)) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
+
/*
* We have the page, copy it to user space buffer.
*/
@@ -1111,13 +1117,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
{
- struct inode *inode = mapping->host;
- pgoff_t index = page->index;
-
- hugetlb_delete_from_page_cache(page);
- if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
-
return 0;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546df97c31e4..e48f8ef45b17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6111,6 +6111,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ptl = huge_pte_lock(h, dst_mm, dst_pte);
+ ret = -EIO;
+ if (PageHWPoison(page))
+ goto out_release_unlock;
+
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 145bb561ddb3..bead6bccc7f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1080,6 +1080,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
@@ -1087,6 +1088,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
unlock_page(hpage);
} else {
unlock_page(hpage);
@@ -1104,7 +1107,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
- if (has_extra_refcount(ps, p, false))
+ if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
return res;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
fb1dec44c675 ("mms: sdhci-esdhc-imx: Fix SDHCI_RESET_ALL for CQHCI")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fb1dec44c6750bb414f47b929c8c175a1a127c31 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Wed, 26 Oct 2022 12:42:06 -0700
Subject: [PATCH] mms: sdhci-esdhc-imx: Fix SDHCI_RESET_ALL for CQHCI
[[ NOTE: this is completely untested by the author, but included solely
because, as noted in commit df57d73276b8 ("mmc: sdhci-pci: Fix
SDHCI_RESET_ALL for CQHCI for Intel GLK-based controllers"), "other
drivers using CQHCI might benefit from a similar change, if they
also have CQHCI reset by SDHCI_RESET_ALL." We've now seen the same
bug on at least MSM, Arasan, and Intel hardware. ]]
SDHCI_RESET_ALL resets will reset the hardware CQE state, but we aren't
tracking that properly in software. When out of sync, we may trigger
various timeouts.
It's not typical to perform resets while CQE is enabled, but this may
occur in some suspend or error recovery scenarios.
Include this fix by way of the new sdhci_and_cqhci_reset() helper.
This patch depends on (and should not compile without) the patch
entitled "mmc: cqhci: Provide helper for resetting both SDHCI and
CQHCI".
Fixes: bb6e358169bf ("mmc: sdhci-esdhc-imx: add CMDQ support")
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Reviewed-by: Haibo Chen <haibo.chen(a)nxp.com>
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20221026124150.v4.4.I7d01f9ad11bacdc9213dee61b791…
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c
index 747df79d90ee..89225faa242a 100644
--- a/drivers/mmc/host/sdhci-esdhc-imx.c
+++ b/drivers/mmc/host/sdhci-esdhc-imx.c
@@ -25,6 +25,7 @@
#include <linux/of_device.h>
#include <linux/pinctrl/consumer.h>
#include <linux/pm_runtime.h>
+#include "sdhci-cqhci.h"
#include "sdhci-pltfm.h"
#include "sdhci-esdhc.h"
#include "cqhci.h"
@@ -1288,7 +1289,7 @@ static void esdhc_set_uhs_signaling(struct sdhci_host *host, unsigned timing)
static void esdhc_reset(struct sdhci_host *host, u8 mask)
{
- sdhci_reset(host, mask);
+ sdhci_and_cqhci_reset(host, mask);
sdhci_writel(host, host->ier, SDHCI_INT_ENABLE);
sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
f90daa975911 ("drm/i915/dmabuf: fix sg_table handling in map_dma_buf")
10be98a77c55 ("drm/i915: Move more GEM objects under gem/")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f90daa975911961b65070ec72bd7dd8d448f9ef7 Mon Sep 17 00:00:00 2001
From: Matthew Auld <matthew.auld(a)intel.com>
Date: Fri, 28 Oct 2022 16:50:26 +0100
Subject: [PATCH] drm/i915/dmabuf: fix sg_table handling in map_dma_buf
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
We need to iterate over the original entries here for the sg_table,
pulling out the struct page for each one, to be remapped. However
currently this incorrectly iterates over the final dma mapped entries,
which is likely just one gigantic sg entry if the iommu is enabled,
leading to us only mapping the first struct page (and any physically
contiguous pages following it), even if there is potentially lots more
data to follow.
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7306
Fixes: 1286ff739773 ("i915: add dmabuf/prime buffer sharing support.")
Signed-off-by: Matthew Auld <matthew.auld(a)intel.com>
Cc: Lionel Landwerlin <lionel.g.landwerlin(a)intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin(a)linux.intel.com>
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Cc: Michael J. Ruhl <michael.j.ruhl(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v3.5+
Reviewed-by: Michael J. Ruhl <michael.j.ruhl(a)intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20221028155029.494736-1-matth…
(cherry picked from commit 28d52f99bbca7227008cf580c9194c9b3516968e)
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com>
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
index f5062d0c6333..824971a1ceec 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
@@ -40,13 +40,13 @@ static struct sg_table *i915_gem_map_dma_buf(struct dma_buf_attachment *attachme
goto err;
}
- ret = sg_alloc_table(st, obj->mm.pages->nents, GFP_KERNEL);
+ ret = sg_alloc_table(st, obj->mm.pages->orig_nents, GFP_KERNEL);
if (ret)
goto err_free;
src = obj->mm.pages->sgl;
dst = st->sgl;
- for (i = 0; i < obj->mm.pages->nents; i++) {
+ for (i = 0; i < obj->mm.pages->orig_nents; i++) {
sg_set_page(dst, sg_page(src), src->length, 0);
dst = sg_next(dst);
src = sg_next(src);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
8cccf05fe857 ("nilfs2: fix use-after-free bug of ns_writer on remount")
1751e8a6cb93 ("Rename superblock flags (MS_xyz -> SB_xyz)")
b04a23421bf6 ("Merge branch 'overlayfs-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8cccf05fe857a18ee26e20d11a8455a73ffd4efd Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Fri, 4 Nov 2022 23:29:59 +0900
Subject: [PATCH] nilfs2: fix use-after-free bug of ns_writer on remount
If a nilfs2 filesystem is downgraded to read-only due to metadata
corruption on disk and is remounted read/write, or if emergency read-only
remount is performed, detaching a log writer and synchronizing the
filesystem can be done at the same time.
In these cases, use-after-free of the log writer (hereinafter
nilfs->ns_writer) can happen as shown in the scenario below:
Task1 Task2
-------------------------------- ------------------------------
nilfs_construct_segment
nilfs_segctor_sync
init_wait
init_waitqueue_entry
add_wait_queue
schedule
nilfs_remount (R/W remount case)
nilfs_attach_log_writer
nilfs_detach_log_writer
nilfs_segctor_destroy
kfree
finish_wait
_raw_spin_lock_irqsave
__raw_spin_lock_irqsave
do_raw_spin_lock
debug_spin_lock_before <-- use-after-free
While Task1 is sleeping, nilfs->ns_writer is freed by Task2. After Task1
waked up, Task1 accesses nilfs->ns_writer which is already freed. This
scenario diagram is based on the Shigeru Yoshida's post [1].
This patch fixes the issue by not detaching nilfs->ns_writer on remount so
that this UAF race doesn't happen. Along with this change, this patch
also inserts a few necessary read-only checks with superblock instance
where only the ns_writer pointer was used to check if the filesystem is
read-only.
Link: https://syzkaller.appspot.com/bug?id=79a4c002e960419ca173d55e863bd09e8112df…
Link: https://lkml.kernel.org/r/20221103141759.1836312-1-syoshida@redhat.com [1]
Link: https://lkml.kernel.org/r/20221104142959.28296-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+f816fa82f8783f7a02bb(a)syzkaller.appspotmail.com
Reported-by: Shigeru Yoshida <syoshida(a)redhat.com>
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index b4cebad21b48..3335ef352915 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -317,7 +317,7 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb)
struct the_nilfs *nilfs = sb->s_fs_info;
struct nilfs_sc_info *sci = nilfs->ns_writer;
- if (!sci || !sci->sc_flush_request)
+ if (sb_rdonly(sb) || unlikely(!sci) || !sci->sc_flush_request)
return;
set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
@@ -2242,7 +2242,7 @@ int nilfs_construct_segment(struct super_block *sb)
struct nilfs_sc_info *sci = nilfs->ns_writer;
struct nilfs_transaction_info *ti;
- if (!sci)
+ if (sb_rdonly(sb) || unlikely(!sci))
return -EROFS;
/* A call inside transactions causes a deadlock. */
@@ -2280,7 +2280,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
struct nilfs_transaction_info ti;
int err = 0;
- if (!sci)
+ if (sb_rdonly(sb) || unlikely(!sci))
return -EROFS;
nilfs_transaction_lock(sb, &ti, 0);
@@ -2776,11 +2776,12 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
if (nilfs->ns_writer) {
/*
- * This happens if the filesystem was remounted
- * read/write after nilfs_error degenerated it into a
- * read-only mount.
+ * This happens if the filesystem is made read-only by
+ * __nilfs_error or nilfs_remount and then remounted
+ * read/write. In these cases, reuse the existing
+ * writer.
*/
- nilfs_detach_log_writer(sb);
+ return 0;
}
nilfs->ns_writer = nilfs_segctor_new(sb, root);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index ba108f915391..6edb6e0dd61f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1133,8 +1133,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
if (*flags & SB_RDONLY) {
- /* Shutting down log writer */
- nilfs_detach_log_writer(sb);
sb->s_flags |= SB_RDONLY;
/*
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
8cccf05fe857 ("nilfs2: fix use-after-free bug of ns_writer on remount")
1751e8a6cb93 ("Rename superblock flags (MS_xyz -> SB_xyz)")
b04a23421bf6 ("Merge branch 'overlayfs-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8cccf05fe857a18ee26e20d11a8455a73ffd4efd Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Fri, 4 Nov 2022 23:29:59 +0900
Subject: [PATCH] nilfs2: fix use-after-free bug of ns_writer on remount
If a nilfs2 filesystem is downgraded to read-only due to metadata
corruption on disk and is remounted read/write, or if emergency read-only
remount is performed, detaching a log writer and synchronizing the
filesystem can be done at the same time.
In these cases, use-after-free of the log writer (hereinafter
nilfs->ns_writer) can happen as shown in the scenario below:
Task1 Task2
-------------------------------- ------------------------------
nilfs_construct_segment
nilfs_segctor_sync
init_wait
init_waitqueue_entry
add_wait_queue
schedule
nilfs_remount (R/W remount case)
nilfs_attach_log_writer
nilfs_detach_log_writer
nilfs_segctor_destroy
kfree
finish_wait
_raw_spin_lock_irqsave
__raw_spin_lock_irqsave
do_raw_spin_lock
debug_spin_lock_before <-- use-after-free
While Task1 is sleeping, nilfs->ns_writer is freed by Task2. After Task1
waked up, Task1 accesses nilfs->ns_writer which is already freed. This
scenario diagram is based on the Shigeru Yoshida's post [1].
This patch fixes the issue by not detaching nilfs->ns_writer on remount so
that this UAF race doesn't happen. Along with this change, this patch
also inserts a few necessary read-only checks with superblock instance
where only the ns_writer pointer was used to check if the filesystem is
read-only.
Link: https://syzkaller.appspot.com/bug?id=79a4c002e960419ca173d55e863bd09e8112df…
Link: https://lkml.kernel.org/r/20221103141759.1836312-1-syoshida@redhat.com [1]
Link: https://lkml.kernel.org/r/20221104142959.28296-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+f816fa82f8783f7a02bb(a)syzkaller.appspotmail.com
Reported-by: Shigeru Yoshida <syoshida(a)redhat.com>
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index b4cebad21b48..3335ef352915 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -317,7 +317,7 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb)
struct the_nilfs *nilfs = sb->s_fs_info;
struct nilfs_sc_info *sci = nilfs->ns_writer;
- if (!sci || !sci->sc_flush_request)
+ if (sb_rdonly(sb) || unlikely(!sci) || !sci->sc_flush_request)
return;
set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
@@ -2242,7 +2242,7 @@ int nilfs_construct_segment(struct super_block *sb)
struct nilfs_sc_info *sci = nilfs->ns_writer;
struct nilfs_transaction_info *ti;
- if (!sci)
+ if (sb_rdonly(sb) || unlikely(!sci))
return -EROFS;
/* A call inside transactions causes a deadlock. */
@@ -2280,7 +2280,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
struct nilfs_transaction_info ti;
int err = 0;
- if (!sci)
+ if (sb_rdonly(sb) || unlikely(!sci))
return -EROFS;
nilfs_transaction_lock(sb, &ti, 0);
@@ -2776,11 +2776,12 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
if (nilfs->ns_writer) {
/*
- * This happens if the filesystem was remounted
- * read/write after nilfs_error degenerated it into a
- * read-only mount.
+ * This happens if the filesystem is made read-only by
+ * __nilfs_error or nilfs_remount and then remounted
+ * read/write. In these cases, reuse the existing
+ * writer.
*/
- nilfs_detach_log_writer(sb);
+ return 0;
}
nilfs->ns_writer = nilfs_segctor_new(sb, root);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index ba108f915391..6edb6e0dd61f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1133,8 +1133,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
if (*flags & SB_RDONLY) {
- /* Shutting down log writer */
- nilfs_detach_log_writer(sb);
sb->s_flags |= SB_RDONLY;
/*
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
ebb5fd38f411 ("mmc: cqhci: Provide helper for resetting both SDHCI and CQHCI")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ebb5fd38f41132e6924cb33b647337f4a5d5360c Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Wed, 26 Oct 2022 12:42:03 -0700
Subject: [PATCH] mmc: cqhci: Provide helper for resetting both SDHCI and CQHCI
Several SDHCI drivers need to deactivate command queueing in their reset
hook (see sdhci_cqhci_reset() / sdhci-pci-core.c, for example), and
several more are coming.
Those reset implementations have some small subtleties (e.g., ordering
of initialization of SDHCI vs. CQHCI might leave us resetting with a
NULL ->cqe_private), and are often identical across different host
drivers.
We also don't want to force a dependency between SDHCI and CQHCI, or
vice versa; non-SDHCI drivers use CQHCI, and SDHCI drivers might support
command queueing through some other means.
So, implement a small helper, to avoid repeating the same mistakes in
different drivers. Simply stick it in a header, because it's so small it
doesn't deserve its own module right now, and inlining to each driver is
pretty reasonable.
This is marked for -stable, as it is an important prerequisite patch for
several SDHCI controller bugfixes that follow.
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Reviewed-by: Florian Fainelli <f.fainelli(a)gmail.com>
Link: https://lore.kernel.org/r/20221026124150.v4.1.Ie85faa09432bfe1b0890d8c24ff9…
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/host/sdhci-cqhci.h b/drivers/mmc/host/sdhci-cqhci.h
new file mode 100644
index 000000000000..cf8e7ba71bbd
--- /dev/null
+++ b/drivers/mmc/host/sdhci-cqhci.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2022 The Chromium OS Authors
+ *
+ * Support that applies to the combination of SDHCI and CQHCI, while not
+ * expressing a dependency between the two modules.
+ */
+
+#ifndef __MMC_HOST_SDHCI_CQHCI_H__
+#define __MMC_HOST_SDHCI_CQHCI_H__
+
+#include "cqhci.h"
+#include "sdhci.h"
+
+static inline void sdhci_and_cqhci_reset(struct sdhci_host *host, u8 mask)
+{
+ if ((host->mmc->caps2 & MMC_CAP2_CQE) && (mask & SDHCI_RESET_ALL) &&
+ host->mmc->cqe_private)
+ cqhci_deactivate(host->mmc);
+
+ sdhci_reset(host, mask);
+}
+
+#endif /* __MMC_HOST_SDHCI_CQHCI_H__ */
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
5d249ac37fc2 ("mmc: sdhci-of-arasan: Fix SDHCI_RESET_ALL for CQHCI")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5d249ac37fc2396e8acc1adb0650cdacae5a990d Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Wed, 26 Oct 2022 12:42:04 -0700
Subject: [PATCH] mmc: sdhci-of-arasan: Fix SDHCI_RESET_ALL for CQHCI
SDHCI_RESET_ALL resets will reset the hardware CQE state, but we aren't
tracking that properly in software. When out of sync, we may trigger
various timeouts.
It's not typical to perform resets while CQE is enabled, but one
particular case I hit commonly enough: mmc_suspend() -> mmc_power_off().
Typically we will eventually deactivate CQE (cqhci_suspend() ->
cqhci_deactivate()), but that's not guaranteed -- in particular, if
we perform a partial (e.g., interrupted) system suspend.
The same bug was already found and fixed for two other drivers, in v5.7
and v5.9:
5cf583f1fb9c ("mmc: sdhci-msm: Deactivate CQE during SDHC reset")
df57d73276b8 ("mmc: sdhci-pci: Fix SDHCI_RESET_ALL for CQHCI for Intel
GLK-based controllers")
The latter is especially prescient, saying "other drivers using CQHCI
might benefit from a similar change, if they also have CQHCI reset by
SDHCI_RESET_ALL."
So like these other patches, deactivate CQHCI when resetting the
controller. Do this via the new sdhci_and_cqhci_reset() helper.
This patch depends on (and should not compile without) the patch
entitled "mmc: cqhci: Provide helper for resetting both SDHCI and
CQHCI".
Fixes: 84362d79f436 ("mmc: sdhci-of-arasan: Add CQHCI support for arasan,sdhci-5.1")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Link: https://lore.kernel.org/r/20221026124150.v4.2.I29f6a2189e84e35ad89c1833793d…
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/host/sdhci-of-arasan.c b/drivers/mmc/host/sdhci-of-arasan.c
index 3997cad1f793..cfb891430174 100644
--- a/drivers/mmc/host/sdhci-of-arasan.c
+++ b/drivers/mmc/host/sdhci-of-arasan.c
@@ -25,6 +25,7 @@
#include <linux/firmware/xlnx-zynqmp.h>
#include "cqhci.h"
+#include "sdhci-cqhci.h"
#include "sdhci-pltfm.h"
#define SDHCI_ARASAN_VENDOR_REGISTER 0x78
@@ -366,7 +367,7 @@ static void sdhci_arasan_reset(struct sdhci_host *host, u8 mask)
struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
struct sdhci_arasan_data *sdhci_arasan = sdhci_pltfm_priv(pltfm_host);
- sdhci_reset(host, mask);
+ sdhci_and_cqhci_reset(host, mask);
if (sdhci_arasan->quirks & SDHCI_ARASAN_QUIRK_FORCE_CDTEST) {
ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
f002f45a00ee ("mmc: sdhci-esdhc-imx: use the correct host caps for MMC_CAP_8_BIT_DATA")
1ed5c3b22fc7 ("mmc: sdhci-esdhc-imx: Propagate ESDHC_FLAG_HS400* only on 8bit bus")
2991ad76d253 ("mmc: sdhci-esdhc-imx: advertise HS400 mode through MMC caps")
854a22997ad5 ("mmc: sdhci-esdhc-imx: Convert the driver to DT-only")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f002f45a00ee14214d96b18b9a555fe2c56afb20 Mon Sep 17 00:00:00 2001
From: Haibo Chen <haibo.chen(a)nxp.com>
Date: Tue, 8 Nov 2022 15:45:03 +0800
Subject: [PATCH] mmc: sdhci-esdhc-imx: use the correct host caps for
MMC_CAP_8_BIT_DATA
MMC_CAP_8_BIT_DATA belongs to struct mmc_host, not struct sdhci_host.
So correct it here.
Fixes: 1ed5c3b22fc7 ("mmc: sdhci-esdhc-imx: Propagate ESDHC_FLAG_HS400* only on 8bit bus")
Signed-off-by: Haibo Chen <haibo.chen(a)nxp.com>
Cc: stable(a)vger.kernel.org
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Link: https://lore.kernel.org/r/1667893503-20583-1-git-send-email-haibo.chen@nxp.…
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c
index 89225faa242a..31ea0a2fce35 100644
--- a/drivers/mmc/host/sdhci-esdhc-imx.c
+++ b/drivers/mmc/host/sdhci-esdhc-imx.c
@@ -1672,14 +1672,14 @@ static int sdhci_esdhc_imx_probe(struct platform_device *pdev)
if (imx_data->socdata->flags & ESDHC_FLAG_ERR004536)
host->quirks |= SDHCI_QUIRK_BROKEN_ADMA;
- if (host->caps & MMC_CAP_8_BIT_DATA &&
+ if (host->mmc->caps & MMC_CAP_8_BIT_DATA &&
imx_data->socdata->flags & ESDHC_FLAG_HS400)
host->mmc->caps2 |= MMC_CAP2_HS400;
if (imx_data->socdata->flags & ESDHC_FLAG_BROKEN_AUTO_CMD23)
host->quirks2 |= SDHCI_QUIRK2_ACMD23_BROKEN;
- if (host->caps & MMC_CAP_8_BIT_DATA &&
+ if (host->mmc->caps & MMC_CAP_8_BIT_DATA &&
imx_data->socdata->flags & ESDHC_FLAG_HS400_ES) {
host->mmc->caps2 |= MMC_CAP2_HS400_ES;
host->mmc_host_ops.hs400_enhanced_strobe =
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
56baa208f910 ("mmc: sdhci-brcmstb: Fix SDHCI_RESET_ALL for CQHCI")
6bcc55fe648b ("mmc: sdhci-brcmstb: Enable Clock Gating to save power")
f3a70f991dd0 ("mmc: sdhci-brcmstb: Re-organize flags")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 56baa208f91061ff27ec2d93fbc483f624d373b4 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Wed, 26 Oct 2022 12:42:05 -0700
Subject: [PATCH] mmc: sdhci-brcmstb: Fix SDHCI_RESET_ALL for CQHCI
[[ NOTE: this is completely untested by the author, but included solely
because, as noted in commit df57d73276b8 ("mmc: sdhci-pci: Fix
SDHCI_RESET_ALL for CQHCI for Intel GLK-based controllers"), "other
drivers using CQHCI might benefit from a similar change, if they
also have CQHCI reset by SDHCI_RESET_ALL." We've now seen the same
bug on at least MSM, Arasan, and Intel hardware. ]]
SDHCI_RESET_ALL resets will reset the hardware CQE state, but we aren't
tracking that properly in software. When out of sync, we may trigger
various timeouts.
It's not typical to perform resets while CQE is enabled, but this may
occur in some suspend or error recovery scenarios.
Include this fix by way of the new sdhci_and_cqhci_reset() helper.
I only patch the bcm7216 variant even though others potentially *could*
provide the 'supports-cqe' property (and thus enable CQHCI), because
d46ba2d17f90 ("mmc: sdhci-brcmstb: Add support for Command Queuing
(CQE)") and some Broadcom folks confirm that only the 7216 variant
actually supports it.
This patch depends on (and should not compile without) the patch
entitled "mmc: cqhci: Provide helper for resetting both SDHCI and
CQHCI".
Fixes: d46ba2d17f90 ("mmc: sdhci-brcmstb: Add support for Command Queuing (CQE)")
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Reviewed-by: Florian Fainelli <f.fainelli(a)gmail.com>
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20221026124150.v4.3.I6a715feab6d01f760455865e968e…
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/host/sdhci-brcmstb.c b/drivers/mmc/host/sdhci-brcmstb.c
index aff36a933ebe..55d8bd232695 100644
--- a/drivers/mmc/host/sdhci-brcmstb.c
+++ b/drivers/mmc/host/sdhci-brcmstb.c
@@ -12,6 +12,7 @@
#include <linux/bitops.h>
#include <linux/delay.h>
+#include "sdhci-cqhci.h"
#include "sdhci-pltfm.h"
#include "cqhci.h"
@@ -55,7 +56,7 @@ static void brcmstb_reset(struct sdhci_host *host, u8 mask)
struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
struct sdhci_brcmstb_priv *priv = sdhci_pltfm_priv(pltfm_host);
- sdhci_reset(host, mask);
+ sdhci_and_cqhci_reset(host, mask);
/* Reset will clear this, so re-enable it */
if (priv->flags & BRCMSTB_PRIV_FLAGS_GATE_CLOCK)
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
56baa208f910 ("mmc: sdhci-brcmstb: Fix SDHCI_RESET_ALL for CQHCI")
6bcc55fe648b ("mmc: sdhci-brcmstb: Enable Clock Gating to save power")
f3a70f991dd0 ("mmc: sdhci-brcmstb: Re-organize flags")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 56baa208f91061ff27ec2d93fbc483f624d373b4 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Wed, 26 Oct 2022 12:42:05 -0700
Subject: [PATCH] mmc: sdhci-brcmstb: Fix SDHCI_RESET_ALL for CQHCI
[[ NOTE: this is completely untested by the author, but included solely
because, as noted in commit df57d73276b8 ("mmc: sdhci-pci: Fix
SDHCI_RESET_ALL for CQHCI for Intel GLK-based controllers"), "other
drivers using CQHCI might benefit from a similar change, if they
also have CQHCI reset by SDHCI_RESET_ALL." We've now seen the same
bug on at least MSM, Arasan, and Intel hardware. ]]
SDHCI_RESET_ALL resets will reset the hardware CQE state, but we aren't
tracking that properly in software. When out of sync, we may trigger
various timeouts.
It's not typical to perform resets while CQE is enabled, but this may
occur in some suspend or error recovery scenarios.
Include this fix by way of the new sdhci_and_cqhci_reset() helper.
I only patch the bcm7216 variant even though others potentially *could*
provide the 'supports-cqe' property (and thus enable CQHCI), because
d46ba2d17f90 ("mmc: sdhci-brcmstb: Add support for Command Queuing
(CQE)") and some Broadcom folks confirm that only the 7216 variant
actually supports it.
This patch depends on (and should not compile without) the patch
entitled "mmc: cqhci: Provide helper for resetting both SDHCI and
CQHCI".
Fixes: d46ba2d17f90 ("mmc: sdhci-brcmstb: Add support for Command Queuing (CQE)")
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Reviewed-by: Florian Fainelli <f.fainelli(a)gmail.com>
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20221026124150.v4.3.I6a715feab6d01f760455865e968e…
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/host/sdhci-brcmstb.c b/drivers/mmc/host/sdhci-brcmstb.c
index aff36a933ebe..55d8bd232695 100644
--- a/drivers/mmc/host/sdhci-brcmstb.c
+++ b/drivers/mmc/host/sdhci-brcmstb.c
@@ -12,6 +12,7 @@
#include <linux/bitops.h>
#include <linux/delay.h>
+#include "sdhci-cqhci.h"
#include "sdhci-pltfm.h"
#include "cqhci.h"
@@ -55,7 +56,7 @@ static void brcmstb_reset(struct sdhci_host *host, u8 mask)
struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
struct sdhci_brcmstb_priv *priv = sdhci_pltfm_priv(pltfm_host);
- sdhci_reset(host, mask);
+ sdhci_and_cqhci_reset(host, mask);
/* Reset will clear this, so re-enable it */
if (priv->flags & BRCMSTB_PRIV_FLAGS_GATE_CLOCK)
Hello
I will like to use the liberty of this medium to inform you as a consultant,that my principal is interested in investing his bond/funds as a silent business partner in your company.Taking into proper
consideration the Return on Investment(ROI) based on a ten (10) year strategic plan.
I shall give you details when you reply.
Regards,
[Public]
Hi,
One more fix was just merged for the DRM buddy regression from 5.19.
This was merged before it got confirmed successful by some of the reporters, so it didn't
get the stable tag added directly.
e0b26b948246 ("drm/amdgpu: Fix the lpfn checking condition in drm buddy")
Can you please take it to 6.0.y? Here are some other tags you can add to it when committed.
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2213
Fixes: c9cad937c0c5 ("drm/amdgpu: add drm buddy support to amdgpu")
Thanks,
The patch below does not apply to the 6.0-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
2081b3bd0c11 ("arm64: fix rodata=full again")
b9dd04a20f81 ("arm64/mm: fold check for KFENCE into can_set_direct_map()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2081b3bd0c11757725dcab9ba5d38e1bddb03459 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb(a)kernel.org>
Date: Thu, 3 Nov 2022 18:00:15 +0100
Subject: [PATCH] arm64: fix rodata=full again
Commit 2e8cff0a0eee87b2 ("arm64: fix rodata=full") addressed a couple of
issues with the rodata= kernel command line option, which is not a
simple boolean on arm64, and inadvertently got broken due to changes in
the generic bool handling.
Unfortunately, the resulting code never clears the rodata_full boolean
variable if it defaults to true and rodata=on or rodata=off is passed,
as the generic code is not aware of the existence of this variable.
Given the way this code is plumbed together, clearing rodata_full when
returning false from arch_parse_debug_rodata() may result in
inconsistencies if the generic code decides that it cannot parse the
right hand side, so the best way to deal with this is to only take
rodata_full in account if rodata_enabled is also true.
Fixes: 2e8cff0a0eee ("arm64: fix rodata=full")
Cc: <stable(a)vger.kernel.org> # 6.0.x
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
Acked-by: Will Deacon <will(a)kernel.org>
Link: https://lore.kernel.org/r/20221103170015.4124426-1-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas(a)arm.com>
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index d107c3d434e2..5922178d7a06 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -26,7 +26,7 @@ bool can_set_direct_map(void)
* mapped at page granularity, so that it is possible to
* protect/unprotect single pages.
*/
- return rodata_full || debug_pagealloc_enabled() ||
+ return (rodata_enabled && rodata_full) || debug_pagealloc_enabled() ||
IS_ENABLED(CONFIG_KFENCE);
}
@@ -102,7 +102,8 @@ static int change_memory_common(unsigned long addr, int numpages,
* If we are manipulating read-only permissions, apply the same
* change to the linear mapping of the pages that back this VM area.
*/
- if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
+ if (rodata_enabled &&
+ rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
pgprot_val(clear_mask) == PTE_RDONLY)) {
for (i = 0; i < area->nr_pages; i++) {
__change_memory_common((u64)page_address(area->pages[i]),
Expose the routine zap_page_range_single to zap a range within a single
vma. The madvise routine madvise_dontneed_single_vma can use this
routine as it explicitly operates on a single vma. Also, update the mmu
notification range in zap_page_range_single to take hugetlb pmd sharing
into account. This is required as MADV_DONTNEED supports hugetlb vmas.
Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reported-by: Wei Chen <harperchen1110(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
---
include/linux/mm.h | 27 +++++++++++++++++++--------
mm/madvise.c | 6 +++---
mm/memory.c | 23 +++++++++++------------
3 files changed, 33 insertions(+), 23 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8fc35edaee0..9e7cad65dfde 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1881,6 +1881,23 @@ static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodem
__show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
}
+/*
+ * Parameter block passed down to zap_pte_range in exceptional cases.
+ */
+struct zap_details {
+ struct folio *single_folio; /* Locked folio to be unmapped */
+ bool even_cows; /* Zap COWed private pages too? */
+ zap_flags_t zap_flags; /* Extra flags for zapping */
+};
+
+/*
+ * Whether to drop the pte markers, for example, the uffd-wp information for
+ * file-backed memory. This should only be specified when we will completely
+ * drop the page in the mm, either by truncation or unmapping of the vma. By
+ * default, the flag is not set.
+ */
+#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
+
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
@@ -1898,6 +1915,8 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details);
void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *start_vma, unsigned long start,
unsigned long end);
@@ -3529,12 +3548,4 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
}
#endif
-/*
- * Whether to drop the pte markers, for example, the uffd-wp information for
- * file-backed memory. This should only be specified when we will completely
- * drop the page in the mm, either by truncation or unmapping of the vma. By
- * default, the flag is not set.
- */
-#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
-
#endif /* _LINUX_MM_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index 68a23104687f..b2f1860a353e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -785,8 +785,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for shrink_active_list to actually free
- * these pages later if no one else has touched them in the meantime,
+ * zap_page_range_single call sets things up for shrink_active_list to actually
+ * free these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
* shrink_active_list to pick up before reclaiming other pages.
*
@@ -803,7 +803,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range(vma, start, end - start);
+ zap_page_range_single(vma, start, end - start, NULL);
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index 98ddb91df9a7..ebdbd395cfad 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1294,15 +1294,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
return ret;
}
-/*
- * Parameter block passed down to zap_pte_range in exceptional cases.
- */
-struct zap_details {
- struct folio *single_folio; /* Locked folio to be unmapped */
- bool even_cows; /* Zap COWed private pages too? */
- zap_flags_t zap_flags; /* Extra flags for zapping */
-};
-
/* Whether we should zap all COWed (private) pages too */
static inline bool should_zap_cows(struct zap_details *details)
{
@@ -1736,19 +1727,27 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
*
* The range must fit into one VMA.
*/
-static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
+ unsigned long end = address + size;
struct mmu_notifier_range range;
struct mmu_gather tlb;
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- address, address + size);
+ address, end);
+ if (is_vm_hugetlb_page(vma))
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
- unmap_single_vma(&tlb, vma, address, range.end, details);
+ /*
+ * unmap 'address-end' not 'range.start-range.end' as range
+ * could have been expanded for hugetlb pmd sharing.
+ */
+ unmap_single_vma(&tlb, vma, address, end, details);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
--
2.37.3
Dear ,
Please can I have your attention and possibly help me for humanity's
sake please. I am writing this message with a heavy heart filled with
sorrows and sadness.
Please if you can respond, i have an issue that i will be most
grateful if you could help me deal with it please.
Susan
The PHY is powered on during phy-init by setting the SW_PWRDN bit in the
COM_POWER_DOWN_CTRL register and then setting the same bit in the in the
PCS_POWER_DOWN_CONTROL register that belongs to the USB part of the
PHY.
Currently, whether power on succeeds depends on probe order and having
the USB part of the PHY be initialised first. In case the DP part of the
PHY is instead initialised first, the intended power on of the USB block
results in a corrupted DP_PHY register (e.g. DP_PHY_AUX_CFG8).
Add a pointer to the USB part of the PHY to the driver data and use that
to power on the PHY also if the DP part of the PHY is initialised first.
Fixes: 52e013d0bffa ("phy: qcom-qmp: Add support for DP in USB3+DP combo phy")
Cc: stable(a)vger.kernel.org # 5.10
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
index 40c25a0ead23..17707f68d482 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
@@ -932,6 +932,7 @@ struct qcom_qmp {
struct regulator_bulk_data *vregs;
struct qmp_phy **phys;
+ struct qmp_phy *usb_phy;
struct mutex phy_mutex;
int init_count;
@@ -1911,7 +1912,7 @@ static int qmp_combo_com_init(struct qmp_phy *qphy)
{
struct qcom_qmp *qmp = qphy->qmp;
const struct qmp_phy_cfg *cfg = qphy->cfg;
- void __iomem *pcs = qphy->pcs;
+ struct qmp_phy *usb_phy = qmp->usb_phy;
void __iomem *dp_com = qmp->dp_com;
int ret;
@@ -1963,7 +1964,8 @@ static int qmp_combo_com_init(struct qmp_phy *qphy)
qphy_clrbits(dp_com, QPHY_V3_DP_COM_SWI_CTRL, 0x03);
qphy_clrbits(dp_com, QPHY_V3_DP_COM_SW_RESET, SW_RESET);
- qphy_setbits(pcs, cfg->regs[QPHY_PCS_POWER_DOWN_CONTROL], SW_PWRDN);
+ qphy_setbits(usb_phy->pcs, usb_phy->cfg->regs[QPHY_PCS_POWER_DOWN_CONTROL],
+ SW_PWRDN);
mutex_unlock(&qmp->phy_mutex);
@@ -2831,6 +2833,8 @@ static int qmp_combo_probe(struct platform_device *pdev)
goto err_node_put;
}
+ qmp->usb_phy = qmp->phys[id];
+
/*
* Register the pipe clock provided by phy.
* See function description to see details of this pipe clock.
@@ -2846,6 +2850,9 @@ static int qmp_combo_probe(struct platform_device *pdev)
id++;
}
+ if (!qmp->usb_phy)
+ return -EINVAL;
+
phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate);
return PTR_ERR_OR_ZERO(phy_provider);
--
2.37.4
The SC8180X has two resets but the DP configuration erroneously
described only one.
In case the DP part of the PHY is initialised before the USB part (e.g.
depending on probe order), then only the first reset would be asserted.
Fixes: 1633802cd4ac ("phy: qcom: qmp: Add SC8180x USB/DP combo")
Cc: stable(a)vger.kernel.org # 5.15
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
index cc53e2f99121..40c25a0ead23 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
@@ -1177,8 +1177,8 @@ static const struct qmp_phy_cfg sc8180x_dpphy_cfg = {
.clk_list = qmp_v3_phy_clk_l,
.num_clks = ARRAY_SIZE(qmp_v3_phy_clk_l),
- .reset_list = sc7180_usb3phy_reset_l,
- .num_resets = ARRAY_SIZE(sc7180_usb3phy_reset_l),
+ .reset_list = msm8996_usb3phy_reset_l,
+ .num_resets = ARRAY_SIZE(msm8996_usb3phy_reset_l),
.vreg_list = qmp_phy_vreg_l,
.num_vregs = ARRAY_SIZE(qmp_phy_vreg_l),
.regs = qmp_v3_usb3phy_regs_layout,
--
2.37.4
The SM8250 only uses three clocks but the DP configuration erroneously
described four clocks.
In case the DP part of the PHY is initialised before the USB part, this
would lead to uninitialised memory beyond the bulk-clocks array to be
treated as a clock pointer as the clocks are requested based on the USB
configuration.
Fixes: aff188feb5e1 ("phy: qcom-qmp: add support for sm8250-usb3-dp phy")
Cc: stable(a)vger.kernel.org # 5.13
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
index 5e11b6a1d189..bb38b18258ca 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
@@ -1270,8 +1270,8 @@ static const struct qmp_phy_cfg sm8250_dpphy_cfg = {
.swing_hbr3_hbr2 = &qmp_dp_v3_voltage_swing_hbr3_hbr2,
.pre_emphasis_hbr3_hbr2 = &qmp_dp_v3_pre_emphasis_hbr3_hbr2,
- .clk_list = qmp_v4_phy_clk_l,
- .num_clks = ARRAY_SIZE(qmp_v4_phy_clk_l),
+ .clk_list = qmp_v4_sm8250_usbphy_clk_l,
+ .num_clks = ARRAY_SIZE(qmp_v4_sm8250_usbphy_clk_l),
.reset_list = msm8996_usb3phy_reset_l,
.num_resets = ARRAY_SIZE(msm8996_usb3phy_reset_l),
.vreg_list = qmp_phy_vreg_l,
--
2.37.4
The PHY is powered on during phy-init by setting the SW_PRWDN bit in the
COM_POWER_DOWN_CTRL register and then setting the same bit in the in the
PCS_POWER_DOWN_CONTROL register that belongs to the USB part of the
PHY.
Currently, whether power on succeeds depends on probe order and having
the USB part of the PHY be initialised first. In case the DP part of the
PHY is instead initialised first, the intended power on of the USB block
results in a corrupted DP_PHY register (e.g. DP_PHY_AUX_CFG8).
Add a pointer to the USB part of the PHY to the driver data and use that
to power on the PHY also if the DP part of the PHY is initialised first.
Fixes: 52e013d0bffa ("phy: qcom-qmp: Add support for DP in USB3+DP combo phy")
Cc: stable(a)vger.kernel.org # 5.10
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
index 40c25a0ead23..17707f68d482 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
@@ -932,6 +932,7 @@ struct qcom_qmp {
struct regulator_bulk_data *vregs;
struct qmp_phy **phys;
+ struct qmp_phy *usb_phy;
struct mutex phy_mutex;
int init_count;
@@ -1911,7 +1912,7 @@ static int qmp_combo_com_init(struct qmp_phy *qphy)
{
struct qcom_qmp *qmp = qphy->qmp;
const struct qmp_phy_cfg *cfg = qphy->cfg;
- void __iomem *pcs = qphy->pcs;
+ struct qmp_phy *usb_phy = qmp->usb_phy;
void __iomem *dp_com = qmp->dp_com;
int ret;
@@ -1963,7 +1964,8 @@ static int qmp_combo_com_init(struct qmp_phy *qphy)
qphy_clrbits(dp_com, QPHY_V3_DP_COM_SWI_CTRL, 0x03);
qphy_clrbits(dp_com, QPHY_V3_DP_COM_SW_RESET, SW_RESET);
- qphy_setbits(pcs, cfg->regs[QPHY_PCS_POWER_DOWN_CONTROL], SW_PWRDN);
+ qphy_setbits(usb_phy->pcs, usb_phy->cfg->regs[QPHY_PCS_POWER_DOWN_CONTROL],
+ SW_PWRDN);
mutex_unlock(&qmp->phy_mutex);
@@ -2831,6 +2833,8 @@ static int qmp_combo_probe(struct platform_device *pdev)
goto err_node_put;
}
+ qmp->usb_phy = qmp->phys[id];
+
/*
* Register the pipe clock provided by phy.
* See function description to see details of this pipe clock.
@@ -2846,6 +2850,9 @@ static int qmp_combo_probe(struct platform_device *pdev)
id++;
}
+ if (!qmp->usb_phy)
+ return -EINVAL;
+
phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate);
return PTR_ERR_OR_ZERO(phy_provider);
--
2.37.4
Good day,
How are you doing today, I hope this email finds you in good health. You have not responded to my previous mail to you regarding Mr. Fredrick.
Kindly acknowledge my proposition and let me know what your decisions are, if you are taking the offer.
Get back to me as soon as you can for further details.
Regards,
Andrew Kyle
--
Hello, Good Day
Please quote us as per attached file on below for pricing and
availability
For 40 pieces of self priming centrifugal pump: SP928974-T95
(Model:SP928974-T95)
Parts:Number:SP928974-T95
Qty.40 pieces
NOTE: send your quotation to:info@unilevernetherlands.nl
Please we are looking forward to receive your best offer
Thank you
Best:Regards/Vriendelijke Groeten
Mr.Ibrahim Bakker
--
Good Day,
Please could you provide us with a quotation pricing / availability for
40
pumps of AP6B78‐810.
Description: BRONZE BOOSTER PUMP AP6B78‐810
Type/Model Number: AP6B78‐810
Quantity: 40 pieces
Note these product(s) we request may fall out of your scope of work,
Therefore we would like you to outsource for the product(s) and supply
to
us.
KINDLY: send your quotation to: purchasing(a)unilevernetherlands.nl
We look forward to receiving your feedback with an offer.
Kind Regards,
Ibrahim Bakker
Area Procurement officer
--
Hello, Good Day
Please quote us as per attached file on below for pricing and
availability
For 40 pieces of self priming centrifugal pump: SP928974-T95
(Model:SP928974-T95)
Parts:Number:SP928974-T95
Qty.40 pieces
NOTE: send your quotation to:info@unilevernetherlands.nl
Please we are looking forward to receive your best offer
Thank you
Best:Regards/Vriendelijke Groeten
Mr.Ibrahim Bakker
The SC8180X has two resets but the DP configuration erroneously
described only one.
In case the DP part of the PHY is initialised before the USB part (e.g.
depending on probe order), then only the first reset would be asserted.
Fixes: 1633802cd4ac ("phy: qcom: qmp: Add SC8180x USB/DP combo")
Cc: stable(a)vger.kernel.org # 5.15
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
index cc53e2f99121..40c25a0ead23 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
@@ -1177,8 +1177,8 @@ static const struct qmp_phy_cfg sc8180x_dpphy_cfg = {
.clk_list = qmp_v3_phy_clk_l,
.num_clks = ARRAY_SIZE(qmp_v3_phy_clk_l),
- .reset_list = sc7180_usb3phy_reset_l,
- .num_resets = ARRAY_SIZE(sc7180_usb3phy_reset_l),
+ .reset_list = msm8996_usb3phy_reset_l,
+ .num_resets = ARRAY_SIZE(msm8996_usb3phy_reset_l),
.vreg_list = qmp_phy_vreg_l,
.num_vregs = ARRAY_SIZE(qmp_phy_vreg_l),
.regs = qmp_v3_usb3phy_regs_layout,
--
2.37.4
The SM8250 only uses three clocks but the DP configuration erroneously
described four clocks.
In case the DP part of the PHY is initialised before the USB part, this
would lead to uninitialised memory beyond the bulk-clocks array to be
treated as a clock pointer as the clocks are requested based on the USB
configuration.
Fixes: aff188feb5e1 ("phy: qcom-qmp: add support for sm8250-usb3-dp phy")
Cc: stable(a)vger.kernel.org # 5.13
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
index 5e11b6a1d189..bb38b18258ca 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
@@ -1270,8 +1270,8 @@ static const struct qmp_phy_cfg sm8250_dpphy_cfg = {
.swing_hbr3_hbr2 = &qmp_dp_v3_voltage_swing_hbr3_hbr2,
.pre_emphasis_hbr3_hbr2 = &qmp_dp_v3_pre_emphasis_hbr3_hbr2,
- .clk_list = qmp_v4_phy_clk_l,
- .num_clks = ARRAY_SIZE(qmp_v4_phy_clk_l),
+ .clk_list = qmp_v4_sm8250_usbphy_clk_l,
+ .num_clks = ARRAY_SIZE(qmp_v4_sm8250_usbphy_clk_l),
.reset_list = msm8996_usb3phy_reset_l,
.num_resets = ARRAY_SIZE(msm8996_usb3phy_reset_l),
.vreg_list = qmp_phy_vreg_l,
--
2.37.4
The patch titled
Subject: hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
hugetlb-dont-delete-vma_lock-in-hugetlb-madv_dontneed-processing.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Subject: hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing
Date: Fri, 11 Nov 2022 15:26:28 -0800
madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page
tables associated with the address range. For hugetlb vmas,
zap_page_range will call __unmap_hugepage_range_final. However,
__unmap_hugepage_range_final assumes the passed vma is about to be removed
and deletes the vma_lock to prevent pmd sharing as the vma is on the way
out. In the case of madvise(MADV_DONTNEED) the vma remains, but the
missing vma_lock prevents pmd sharing and could potentially lead to issues
with truncation/fault races.
This issue was originally reported here [1] as a BUG triggered in
page_try_dup_anon_rmap. Prior to the introduction of the hugetlb
vma_lock, __unmap_hugepage_range_final cleared the VM_MAYSHARE flag to
prevent pmd sharing. Subsequent faults on this vma were confused as
VM_MAYSHARE indicates a sharable vma, but was not set so page_mapping was
not set in new pages added to the page table. This resulted in pages that
appeared anonymous in a VM_SHARED vma and triggered the BUG.
Address issue by adding a new zap flag ZAP_FLAG_UNMAP to indicate an unmap
call from unmap_vmas(). This is used to indicate the 'final' unmapping of
a hugetlb vma. When called via MADV_DONTNEED, this flag is not set and
the vm_lock is not deleted.
[1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6J…
Link: https://lkml.kernel.org/r/20221111232628.290160-4-mike.kravetz@oracle.com
Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reported-by: Wei Chen <harperchen1110(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Mina Almasry <almasrymina(a)google.com>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Naoya Horiguchi <naoya.horiguchi(a)linux.dev>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm.h | 2 ++
mm/hugetlb.c | 27 ++++++++++++++++-----------
mm/memory.c | 2 +-
3 files changed, 19 insertions(+), 12 deletions(-)
--- a/include/linux/mm.h~hugetlb-dont-delete-vma_lock-in-hugetlb-madv_dontneed-processing
+++ a/include/linux/mm.h
@@ -1868,6 +1868,8 @@ struct zap_details {
* default, the flag is not set.
*/
#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
+/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
+#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
--- a/mm/hugetlb.c~hugetlb-dont-delete-vma_lock-in-hugetlb-madv_dontneed-processing
+++ a/mm/hugetlb.c
@@ -5196,17 +5196,22 @@ void __unmap_hugepage_range_final(struct
/* mmu notification performed in caller */
__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
- /*
- * Unlock and free the vma lock before releasing i_mmap_rwsem. When
- * the vma_lock is freed, this makes the vma ineligible for pmd
- * sharing. And, i_mmap_rwsem is required to set up pmd sharing.
- * This is important as page tables for this unmapped range will
- * be asynchrously deleted. If the page tables are shared, there
- * will be issues when accessed by someone else.
- */
- __hugetlb_vma_unlock_write_free(vma);
-
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
+ /*
+ * Unlock and free the vma lock before releasing i_mmap_rwsem.
+ * When the vma_lock is freed, this makes the vma ineligible
+ * for pmd sharing. And, i_mmap_rwsem is required to set up
+ * pmd sharing. This is important as page tables for this
+ * unmapped range will be asynchrously deleted. If the page
+ * tables are shared, there will be issues when accessed by
+ * someone else.
+ */
+ __hugetlb_vma_unlock_write_free(vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
--- a/mm/memory.c~hugetlb-dont-delete-vma_lock-in-hugetlb-madv_dontneed-processing
+++ a/mm/memory.c
@@ -1711,7 +1711,7 @@ void unmap_vmas(struct mmu_gather *tlb,
{
struct mmu_notifier_range range;
struct zap_details details = {
- .zap_flags = ZAP_FLAG_DROP_MARKER,
+ .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
/* Careful - we need to zap private pages too! */
.even_cows = true,
};
_
Patches currently in -mm which might be from mike.kravetz(a)oracle.com are
ipc-shm-call-underlying-open-close-vm_ops.patch
madvise-use-zap_page_range_single-for-madvise-dontneed.patch
hugetlb-remove-duplicate-mmu-notifications.patch
hugetlb-dont-delete-vma_lock-in-hugetlb-madv_dontneed-processing.patch
selftests-vm-update-hugetlb-madvise.patch
The patch titled
Subject: hugetlb: remove duplicate mmu notifications
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
hugetlb-remove-duplicate-mmu-notifications.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Subject: hugetlb: remove duplicate mmu notifications
Date: Fri, 11 Nov 2022 15:26:27 -0800
The common hugetlb unmap routine __unmap_hugepage_range performs mmu
notification calls. However, in the case where __unmap_hugepage_range is
called via __unmap_hugepage_range_final, mmu notification calls are
performed earlier in other calling routines.
Remove mmu notification calls from __unmap_hugepage_range. Add
notification calls to the only other caller: unmap_hugepage_range.
unmap_hugepage_range is called for truncation and hole punch, so change
notification type from UNMAP to CLEAR as this is more appropriate.
Link: https://lkml.kernel.org/r/20221111232628.290160-3-mike.kravetz@oracle.com
Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reported-by: Wei Chen <harperchen1110(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Mina Almasry <almasrymina(a)google.com>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Naoya Horiguchi <naoya.horiguchi(a)linux.dev>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
--- a/mm/hugetlb.c~hugetlb-remove-duplicate-mmu-notifications
+++ a/mm/hugetlb.c
@@ -5064,7 +5064,6 @@ static void __unmap_hugepage_range(struc
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5079,13 +5078,6 @@ static void __unmap_hugepage_range(struc
tlb_change_page_size(tlb, sz);
tlb_start_vma(tlb, vma);
- /*
- * If sharing possible, alert mmu notifiers of worst case.
- */
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
- end);
- adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
- mmu_notifier_invalidate_range_start(&range);
last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
@@ -5174,7 +5166,6 @@ static void __unmap_hugepage_range(struc
if (ref_page)
break;
}
- mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
/*
@@ -5202,6 +5193,7 @@ void __unmap_hugepage_range_final(struct
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
+ /* mmu notification performed in caller */
__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
/*
@@ -5221,10 +5213,18 @@ void unmap_hugepage_range(struct vm_area
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
{
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
tlb_gather_mmu(&tlb, vma->vm_mm);
+
__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+
+ mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
_
Patches currently in -mm which might be from mike.kravetz(a)oracle.com are
ipc-shm-call-underlying-open-close-vm_ops.patch
madvise-use-zap_page_range_single-for-madvise-dontneed.patch
hugetlb-remove-duplicate-mmu-notifications.patch
hugetlb-dont-delete-vma_lock-in-hugetlb-madv_dontneed-processing.patch
selftests-vm-update-hugetlb-madvise.patch
The patch titled
Subject: madvise: use zap_page_range_single for madvise dontneed
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
madvise-use-zap_page_range_single-for-madvise-dontneed.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Subject: madvise: use zap_page_range_single for madvise dontneed
Date: Fri, 11 Nov 2022 15:26:26 -0800
Patch series "fix hugetlb MADV_DONTNEED vma_lock handling", v9.
This series addresses the issue first reported in [1], and fully described
in patch 3. While exploring solutions to this issue, related problems
with mmu notification calls were discovered. The first two patches
address those issues.
Previous discussions suggested further cleanup by removing the routine
zap_page_range. This is possible because zap_page_range_single is now
exported, and all callers of zap_page_range pass ranges entirely within a
single vma. This work will be done in a later patch so as not to distract
from this bug fix.
[1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6J…
This patch (of 3):
Expose the routine zap_page_range_single to zap a range within a single
vma. The madvise routine madvise_dontneed_single_vma can use this routine
as it explicitly operates on a single vma. Also, update the mmu
notification range in zap_page_range_single to take hugetlb pmd sharing
into account. This is required as MADV_DONTNEED supports hugetlb vmas.
Link: https://lkml.kernel.org/r/20221111232628.290160-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20221111232628.290160-2-mike.kravetz@oracle.com
Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reported-by: Wei Chen <harperchen1110(a)gmail.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Mina Almasry <almasrymina(a)google.com>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Naoya Horiguchi <naoya.horiguchi(a)linux.dev>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm.h | 27 +++++++++++++++++++--------
mm/madvise.c | 6 +++---
mm/memory.c | 23 +++++++++++------------
3 files changed, 33 insertions(+), 23 deletions(-)
--- a/include/linux/mm.h~madvise-use-zap_page_range_single-for-madvise-dontneed
+++ a/include/linux/mm.h
@@ -1852,6 +1852,23 @@ static void __maybe_unused show_free_are
__show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
}
+/*
+ * Parameter block passed down to zap_pte_range in exceptional cases.
+ */
+struct zap_details {
+ struct folio *single_folio; /* Locked folio to be unmapped */
+ bool even_cows; /* Zap COWed private pages too? */
+ zap_flags_t zap_flags; /* Extra flags for zapping */
+};
+
+/*
+ * Whether to drop the pte markers, for example, the uffd-wp information for
+ * file-backed memory. This should only be specified when we will completely
+ * drop the page in the mm, either by truncation or unmapping of the vma. By
+ * default, the flag is not set.
+ */
+#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
+
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
@@ -1869,6 +1886,8 @@ void zap_vma_ptes(struct vm_area_struct
unsigned long size);
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details);
void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *start_vma, unsigned long start,
unsigned long end);
@@ -3467,12 +3486,4 @@ madvise_set_anon_name(struct mm_struct *
}
#endif
-/*
- * Whether to drop the pte markers, for example, the uffd-wp information for
- * file-backed memory. This should only be specified when we will completely
- * drop the page in the mm, either by truncation or unmapping of the vma. By
- * default, the flag is not set.
- */
-#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
-
#endif /* _LINUX_MM_H */
--- a/mm/madvise.c~madvise-use-zap_page_range_single-for-madvise-dontneed
+++ a/mm/madvise.c
@@ -772,8 +772,8 @@ static int madvise_free_single_vma(struc
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for shrink_active_list to actually free
- * these pages later if no one else has touched them in the meantime,
+ * zap_page_range_single call sets things up for shrink_active_list to actually
+ * free these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
* shrink_active_list to pick up before reclaiming other pages.
*
@@ -790,7 +790,7 @@ static int madvise_free_single_vma(struc
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range(vma, start, end - start);
+ zap_page_range_single(vma, start, end - start, NULL);
return 0;
}
--- a/mm/memory.c~madvise-use-zap_page_range_single-for-madvise-dontneed
+++ a/mm/memory.c
@@ -1341,15 +1341,6 @@ copy_page_range(struct vm_area_struct *d
return ret;
}
-/*
- * Parameter block passed down to zap_pte_range in exceptional cases.
- */
-struct zap_details {
- struct folio *single_folio; /* Locked folio to be unmapped */
- bool even_cows; /* Zap COWed private pages too? */
- zap_flags_t zap_flags; /* Extra flags for zapping */
-};
-
/* Whether we should zap all COWed (private) pages too */
static inline bool should_zap_cows(struct zap_details *details)
{
@@ -1774,19 +1765,27 @@ void zap_page_range(struct vm_area_struc
*
* The range must fit into one VMA.
*/
-static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
+ unsigned long end = address + size;
struct mmu_notifier_range range;
struct mmu_gather tlb;
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- address, address + size);
+ address, end);
+ if (is_vm_hugetlb_page(vma))
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
- unmap_single_vma(&tlb, vma, address, range.end, details);
+ /*
+ * unmap 'address-end' not 'range.start-range.end' as range
+ * could have been expanded for hugetlb pmd sharing.
+ */
+ unmap_single_vma(&tlb, vma, address, end, details);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
_
Patches currently in -mm which might be from mike.kravetz(a)oracle.com are
ipc-shm-call-underlying-open-close-vm_ops.patch
madvise-use-zap_page_range_single-for-madvise-dontneed.patch
hugetlb-remove-duplicate-mmu-notifications.patch
hugetlb-dont-delete-vma_lock-in-hugetlb-madv_dontneed-processing.patch
selftests-vm-update-hugetlb-madvise.patch
madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page
tables associated with the address range. For hugetlb vmas,
zap_page_range will call __unmap_hugepage_range_final. However,
__unmap_hugepage_range_final assumes the passed vma is about to be removed
and deletes the vma_lock to prevent pmd sharing as the vma is on the way
out. In the case of madvise(MADV_DONTNEED) the vma remains, but the
missing vma_lock prevents pmd sharing and could potentially lead to issues
with truncation/fault races.
This issue was originally reported here [1] as a BUG triggered in
page_try_dup_anon_rmap. Prior to the introduction of the hugetlb
vma_lock, __unmap_hugepage_range_final cleared the VM_MAYSHARE flag to
prevent pmd sharing. Subsequent faults on this vma were confused as
VM_MAYSHARE indicates a sharable vma, but was not set so page_mapping was
not set in new pages added to the page table. This resulted in pages that
appeared anonymous in a VM_SHARED vma and triggered the BUG.
Address issue by adding a new zap flag ZAP_FLAG_UNMAP to indicate an unmap
call from unmap_vmas(). This is used to indicate the 'final' unmapping of
a hugetlb vma. When called via MADV_DONTNEED, this flag is not set and the
vm_lock is not deleted.
[1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6J…
Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reported-by: Wei Chen <harperchen1110(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
---
include/linux/mm.h | 2 ++
mm/hugetlb.c | 27 ++++++++++++++++-----------
mm/memory.c | 2 +-
3 files changed, 19 insertions(+), 12 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9e7cad65dfde..d0e8d5007343 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1897,6 +1897,8 @@ struct zap_details {
* default, the flag is not set.
*/
#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
+/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
+#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 205c67c6787a..0cdefa63f474 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5202,17 +5202,22 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
/* mmu notification performed in caller */
__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
- /*
- * Unlock and free the vma lock before releasing i_mmap_rwsem. When
- * the vma_lock is freed, this makes the vma ineligible for pmd
- * sharing. And, i_mmap_rwsem is required to set up pmd sharing.
- * This is important as page tables for this unmapped range will
- * be asynchrously deleted. If the page tables are shared, there
- * will be issues when accessed by someone else.
- */
- __hugetlb_vma_unlock_write_free(vma);
-
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
+ /*
+ * Unlock and free the vma lock before releasing i_mmap_rwsem.
+ * When the vma_lock is freed, this makes the vma ineligible
+ * for pmd sharing. And, i_mmap_rwsem is required to set up
+ * pmd sharing. This is important as page tables for this
+ * unmapped range will be asynchrously deleted. If the page
+ * tables are shared, there will be issues when accessed by
+ * someone else.
+ */
+ __hugetlb_vma_unlock_write_free(vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
diff --git a/mm/memory.c b/mm/memory.c
index ebdbd395cfad..2d453736f87c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1673,7 +1673,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
{
struct mmu_notifier_range range;
struct zap_details details = {
- .zap_flags = ZAP_FLAG_DROP_MARKER,
+ .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
/* Careful - we need to zap private pages too! */
.even_cows = true,
};
--
2.37.3
The quilt patch titled
Subject: hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing
has been removed from the -mm tree. Its filename was
hugetlb-dont-delete-vma_lock-in-hugetlb-madv_dontneed-processing.patch
This patch was dropped because an updated version will be merged
------------------------------------------------------
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Subject: hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing
Date: Tue, 1 Nov 2022 18:31:00 -0700
madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page
tables associated with the address range. For hugetlb vmas,
zap_page_range will call __unmap_hugepage_range_final. However,
__unmap_hugepage_range_final assumes the passed vma is about to be removed
and deletes the vma_lock to prevent pmd sharing as the vma is on the way
out. In the case of madvise(MADV_DONTNEED) the vma remains, but the
missing vma_lock prevents pmd sharing and could potentially lead to issues
with truncation/fault races.
This issue was originally reported here [1] as a BUG triggered in
page_try_dup_anon_rmap. Prior to the introduction of the hugetlb
vma_lock, __unmap_hugepage_range_final cleared the VM_MAYSHARE flag to
prevent pmd sharing. Subsequent faults on this vma were confused as
VM_MAYSHARE indicates a sharable vma, but was not set so page_mapping was
not set in new pages added to the page table. This resulted in pages that
appeared anonymous in a VM_SHARED vma and triggered the BUG.
Address issue by:
- Add a new zap flag ZAP_FLAG_UNMAP to indicate an unmap call from
unmap_vmas(). This is used to indicate the 'final' unmapping of a vma.
When called via MADV_DONTNEED, this flag is not set and the vm_lock is
not deleted.
- mmu notification is removed from __unmap_hugepage_range to avoid
duplication, and notification is added to the other calling routine
(unmap_hugepage_range).
- notification calls are updated in zap_page range to take into account
the possibility of multiple vmas.
[1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6J…
Link: https://lkml.kernel.org/r/20221102013100.455139-1-mike.kravetz@oracle.com
Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reported-by: Wei Chen <harperchen1110(a)gmail.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Mina Almasry <almasrymina(a)google.com>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Naoya Horiguchi <naoya.horiguchi(a)linux.dev>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm.h | 3 ++
mm/hugetlb.c | 45 +++++++++++++++++++++++++------------------
mm/memory.c | 21 ++++++++++++++------
3 files changed, 45 insertions(+), 24 deletions(-)
--- a/include/linux/mm.h~hugetlb-dont-delete-vma_lock-in-hugetlb-madv_dontneed-processing
+++ a/include/linux/mm.h
@@ -3475,4 +3475,7 @@ madvise_set_anon_name(struct mm_struct *
*/
#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
+/* Set in unmap_vmas() to indicate an unmap call. Only used by hugetlb */
+#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
+
#endif /* _LINUX_MM_H */
--- a/mm/hugetlb.c~hugetlb-dont-delete-vma_lock-in-hugetlb-madv_dontneed-processing
+++ a/mm/hugetlb.c
@@ -5064,7 +5064,6 @@ static void __unmap_hugepage_range(struc
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5079,13 +5078,6 @@ static void __unmap_hugepage_range(struc
tlb_change_page_size(tlb, sz);
tlb_start_vma(tlb, vma);
- /*
- * If sharing possible, alert mmu notifiers of worst case.
- */
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
- end);
- adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
- mmu_notifier_invalidate_range_start(&range);
last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
@@ -5174,7 +5166,6 @@ static void __unmap_hugepage_range(struc
if (ref_page)
break;
}
- mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
/*
@@ -5199,32 +5190,50 @@ void __unmap_hugepage_range_final(struct
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
{
+ bool final = zap_flags & ZAP_FLAG_UNMAP;
+
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
/*
- * Unlock and free the vma lock before releasing i_mmap_rwsem. When
- * the vma_lock is freed, this makes the vma ineligible for pmd
- * sharing. And, i_mmap_rwsem is required to set up pmd sharing.
- * This is important as page tables for this unmapped range will
- * be asynchrously deleted. If the page tables are shared, there
- * will be issues when accessed by someone else.
+ * When called via zap_page_range (MADV_DONTNEED), this is not the
+ * final unmap of the vma, and we do not want to delete the vma_lock.
*/
- __hugetlb_vma_unlock_write_free(vma);
-
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ if (final) {
+ /*
+ * Unlock and free the vma lock before releasing i_mmap_rwsem.
+ * When the vma_lock is freed, this makes the vma ineligible
+ * for pmd sharing. And, i_mmap_rwsem is required to set up
+ * pmd sharing. This is important as page tables for this
+ * unmapped range will be asynchrously deleted. If the page
+ * tables are shared, there will be issues when accessed by
+ * someone else.
+ */
+ __hugetlb_vma_unlock_write_free(vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
{
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
tlb_gather_mmu(&tlb, vma->vm_mm);
+
__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+
+ mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
--- a/mm/memory.c~hugetlb-dont-delete-vma_lock-in-hugetlb-madv_dontneed-processing
+++ a/mm/memory.c
@@ -1720,7 +1720,7 @@ void unmap_vmas(struct mmu_gather *tlb,
{
struct mmu_notifier_range range;
struct zap_details details = {
- .zap_flags = ZAP_FLAG_DROP_MARKER,
+ .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
/* Careful - we need to zap private pages too! */
.even_cows = true,
};
@@ -1753,15 +1753,24 @@ void zap_page_range(struct vm_area_struc
MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
lru_add_drain();
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- start, start + size);
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
- mmu_notifier_invalidate_range_start(&range);
do {
- unmap_single_vma(&tlb, vma, start, range.end, NULL);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma,
+ vma->vm_mm,
+ max(start, vma->vm_start),
+ min(end, vma->vm_end));
+ if (is_vm_hugetlb_page(vma))
+ adjust_range_if_pmd_sharing_possible(vma,
+ &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
+ /*
+ * unmap 'start-end' not 'range.start-range.end' as range
+ * could have been expanded for pmd sharing.
+ */
+ unmap_single_vma(&tlb, vma, start, end, NULL);
+ mmu_notifier_invalidate_range_end(&range);
} while ((vma = mas_find(&mas, end - 1)) != NULL);
- mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
_
Patches currently in -mm which might be from mike.kravetz(a)oracle.com are
ipc-shm-call-underlying-open-close-vm_ops.patch
selftests-vm-update-hugetlb-madvise.patch
Hi,
This is v2 of the fix & selftest previously sent at:
https://lore.kernel.org/linux-mm/20221108195211.214025-1-flaniel@linux.micr…
Changes v1 to v2:
- add 'cc:stable', 'Fixes:' and review/ack tags
- update commitmsg and fix my email
- rebase on bpf tree and tag for bpf tree
Thanks!
Alban Crequy (2):
maccess: fix writing offset in case of fault in
strncpy_from_kernel_nofault()
selftests: bpf: add a test when bpf_probe_read_kernel_str() returns
EFAULT
mm/maccess.c | 2 +-
tools/testing/selftests/bpf/prog_tests/varlen.c | 7 +++++++
tools/testing/selftests/bpf/progs/test_varlen.c | 5 +++++
3 files changed, 13 insertions(+), 1 deletion(-)
--
2.36.1
From: Kyle Huey <me(a)kylehuey.com>
When management of the PKRU register was moved away from XSTATE, emulation
of PKRU's existence in XSTATE was added for reading PKRU through ptrace,
but not for writing PKRU through ptrace. This can be seen by running gdb
and executing `p $pkru`, `set $pkru = 42`, and `p $pkru`. On affected
kernels (5.14+) the write to the PKRU register (which gdb performs through
ptrace) is ignored.
There are three APIs that write PKRU: sigreturn, PTRACE_SETREGSET with
NT_X86_XSTATE, and KVM_SET_XSAVE. sigreturn still uses XRSTOR to write to
PKRU. KVM_SET_XSAVE has its own special handling to make PKRU writes take
effect (in fpu_copy_uabi_to_guest_fpstate). Push that down into
copy_uabi_to_xstate and have PTRACE_SETREGSET with NT_X86_XSTATE pass in
a pointer to the appropriate PKRU slot. copy_sigframe_from_user_to_xstate
depends on copy_uabi_to_xstate populating the PKRU field in the task's
XSTATE so that __fpu_restore_sig can do a XRSTOR from it, so continue doing
that.
This also adds code to initialize the PKRU value to the hardware init value
(namely 0) if the PKRU bit is not set in the XSTATE header provided to
ptrace, to match XRSTOR.
Changelog since v5:
- Avoids a second copy from the uabi buffer as suggested.
- Preserves old KVM_SET_XSAVE behavior where leaving the PKRU bit in the
XSTATE header results in PKRU remaining unchanged instead of
reinitializing it.
- Fixed up patch metadata as requested.
Changelog since v4:
- Selftest additionally checks PKRU readbacks through ptrace.
- Selftest flips all PKRU bits (except the default key).
Changelog since v3:
- The v3 patch is now part 1 of 2.
- Adds a selftest in part 2 of 2.
Changelog since v2:
- Removed now unused variables in fpu_copy_uabi_to_guest_fpstate
Changelog since v1:
- Handles the error case of copy_to_buffer().
Fixes: e84ba47e313d ("x86/fpu: Hook up PKRU into ptrace()")
Signed-off-by: Kyle Huey <me(a)kylehuey.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Borislav Petkov <bp(a)suse.de>
Cc: stable(a)vger.kernel.org # 5.14+
---
arch/x86/kernel/fpu/core.c | 20 +++++++++-----------
arch/x86/kernel/fpu/regset.c | 2 +-
arch/x86/kernel/fpu/signal.c | 2 +-
arch/x86/kernel/fpu/xstate.c | 25 ++++++++++++++++++++-----
arch/x86/kernel/fpu/xstate.h | 4 ++--
5 files changed, 33 insertions(+), 20 deletions(-)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 3b28c5b25e12..c273669e8a00 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -391,8 +391,6 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
{
struct fpstate *kstate = gfpu->fpstate;
const union fpregs_state *ustate = buf;
- struct pkru_state *xpkru;
- int ret;
if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE)
@@ -406,16 +404,16 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
if (ustate->xsave.header.xfeatures & ~xcr0)
return -EINVAL;
- ret = copy_uabi_from_kernel_to_xstate(kstate, ustate);
- if (ret)
- return ret;
+ /*
+ * Nullify @vpkru to preserve its current value if PKRU's bit isn't set
+ * in the header. KVM's odd ABI is to leave PKRU untouched in this
+ * case (all other components are eventually re-initialized).
+ * (Not clear that this is actually necessary for compat).
+ */
+ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU))
+ vpkru = NULL;
- /* Retrieve PKRU if not in init state */
- if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) {
- xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU);
- *vpkru = xpkru->pkru;
- }
- return 0;
+ return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
}
EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
#endif /* CONFIG_KVM */
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 75ffaef8c299..6d056b68f4ed 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -167,7 +167,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
}
fpu_force_restore(fpu);
- ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf);
+ ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf, &target->thread.pkru);
out:
vfree(tmpbuf);
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 91d4b6de58ab..558076dbde5b 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -396,7 +396,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
fpregs = &fpu->fpstate->regs;
if (use_xsave() && !fx_only) {
- if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx))
+ if (copy_sigframe_from_user_to_xstate(tsk, buf_fx))
return false;
} else {
if (__copy_from_user(&fpregs->fxsave, buf_fx,
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c8340156bfd2..8f14981a3936 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1197,7 +1197,7 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
- const void __user *ubuf)
+ const void __user *ubuf, u32 *pkru)
{
struct xregs_state *xsave = &fpstate->regs.xsave;
unsigned int offset, size;
@@ -1246,6 +1246,21 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
}
}
+ /*
+ * Update the user protection key storage. Allow KVM to
+ * pass in a NULL pkru pointer if the mask bit is unset
+ * for its legacy ABI behavior.
+ */
+ if (pkru)
+ *pkru = 0;
+
+ if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
+ struct pkru_state *xpkru;
+
+ xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
+ *pkru = xpkru->pkru;
+ }
+
/*
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
@@ -1264,9 +1279,9 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
* Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
* format and copy to the target thread. Used by ptrace and KVM.
*/
-int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
+int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
{
- return copy_uabi_to_xstate(fpstate, kbuf, NULL);
+ return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
}
/*
@@ -1274,10 +1289,10 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
* XSAVE[S] format and copy to the target thread. This is called from the
* sigreturn() and rt_sigreturn() system calls.
*/
-int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate,
+int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
const void __user *ubuf)
{
- return copy_uabi_to_xstate(fpstate, NULL, ubuf);
+ return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
}
static bool validate_independent_components(u64 mask)
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 5ad47031383b..a4ecb04d8d64 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -46,8 +46,8 @@ extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
u32 pkru_val, enum xstate_copy_mode copy_mode);
extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
enum xstate_copy_mode mode);
-extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf);
-extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf);
+extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
+extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf);
extern void fpu__init_cpu_xstate(void);
--
2.37.3
When adding support for the DisplayPort part of the QMP PHY the binding
(and devicetree parser) for the (USB) child node was simply reused and
this has lead to some confusion.
The third DP register region is really the DP_PHY region, not "PCS" as
the binding claims, and lie at offset 0x2a00 (not 0x2c00).
Similarly, there likely are no "RX", "RX2" or "PCS_MISC" regions as
there are for the USB part of the PHY (and in any case the Linux driver
does not use them).
Note that the sixth "PCS_MISC" region is not even in the binding.
Fixes: 5aa0d1becd5b ("arm64: dts: qcom: sm8250: switch usb1 qmp phy to USB3+DP mode")
Cc: stable(a)vger.kernel.org # 5.13
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
arch/arm64/boot/dts/qcom/sm8250.dtsi | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/boot/dts/qcom/sm8250.dtsi b/arch/arm64/boot/dts/qcom/sm8250.dtsi
index 52686f963dfc..0e86ed688fc2 100644
--- a/arch/arm64/boot/dts/qcom/sm8250.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8250.dtsi
@@ -2891,10 +2891,9 @@ usb_1_ssphy: usb3-phy@88e9200 {
dp_phy: dp-phy@88ea200 {
reg = <0 0x088ea200 0 0x200>,
<0 0x088ea400 0 0x200>,
- <0 0x088eac00 0 0x400>,
+ <0 0x088eaa00 0 0x200>,
<0 0x088ea600 0 0x200>,
- <0 0x088ea800 0 0x200>,
- <0 0x088eaa00 0 0x100>;
+ <0 0x088ea800 0 0x200>;
#phy-cells = <0>;
#clock-cells = <1>;
};
--
2.37.4
When adding support for the DisplayPort part of the QMP PHY the binding
(and devicetree parser) for the (USB) child node was simply reused and
this has lead to some confusion.
The third DP register region is really the DP_PHY region, not "PCS" as
the binding claims, and lie at offset 0x2a00 (not 0x2c00).
Similarly, there likely are no "RX", "RX2" or "PCS_MISC" regions as
there are for the USB part of the PHY (and in any case the Linux driver
does not use them).
Note that the sixth "PCS_MISC" region is not even in the binding.
Fixes: 23737b9557fe ("arm64: dts: qcom: sm6350: Add USB1 nodes")
Cc: stable(a)vger.kernel.org # 5.16
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
arch/arm64/boot/dts/qcom/sm6350.dtsi | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/boot/dts/qcom/sm6350.dtsi b/arch/arm64/boot/dts/qcom/sm6350.dtsi
index 3a315280c34a..f3843749ec63 100644
--- a/arch/arm64/boot/dts/qcom/sm6350.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm6350.dtsi
@@ -1148,10 +1148,9 @@ usb_1_ssphy: usb3-phy@88e9200 {
dp_phy: dp-phy@88ea200 {
reg = <0 0x088ea200 0 0x200>,
<0 0x088ea400 0 0x200>,
- <0 0x088eac00 0 0x400>,
+ <0 0x088eaa00 0 0x200>,
<0 0x088ea600 0 0x200>,
- <0 0x088ea800 0 0x200>,
- <0 0x088eaa00 0 0x100>;
+ <0 0x088ea800 0 0x200>;
#phy-cells = <0>;
#clock-cells = <1>;
};
--
2.37.4
commit dc63a086daee92c63e392e4e7cd7ed61f3693026 upstream.
This is useful on !virt platforms for kexec, so change things from
BI_VIRT_RNG_SEED to be BI_RNG_SEED, and simply remove BI_VIRT_RNG_SEED
because it only ever lasted one release, and nothing is broken by not
having it. At the same time, keep a comment noting that it's been
removed, so that ID isn't reused. In addition, we previously documented
2-byte alignment, but 4-byte alignment is actually necessary, so update
that comment.
Suggested-by: Geert Uytterhoeven <geert(a)linux-m68k.org>
Fixes: a1ee38ab1a75 ("m68k: virt: Use RNG seed from bootinfo block")
Signed-off-by: Jason A. Donenfeld <Jason(a)zx2c4.com>
Link: https://lore.kernel.org/r/20220927130835.1629806-2-Jason@zx2c4.com
Signed-off-by: Geert Uytterhoeven <geert(a)linux-m68k.org>
Signed-off-by: Jason A. Donenfeld <Jason(a)zx2c4.com>
---
Greg - this has a "Fixes" tag and has been in Linus' tree for a while,
and it applies without anything special, but for some reason didn't make
it to 6.0.y, and I'm not sure why. So sending it to you manually. But
I'd be curious to learn why the scripts missed this one. -Jason
arch/m68k/include/uapi/asm/bootinfo-virt.h | 9 ++-------
arch/m68k/include/uapi/asm/bootinfo.h | 7 +++++++
arch/m68k/kernel/setup_mm.c | 12 ++++++++++++
arch/m68k/virt/config.c | 11 -----------
4 files changed, 21 insertions(+), 18 deletions(-)
diff --git a/arch/m68k/include/uapi/asm/bootinfo-virt.h b/arch/m68k/include/uapi/asm/bootinfo-virt.h
index b091ee9b06e0..7dbcd7bec103 100644
--- a/arch/m68k/include/uapi/asm/bootinfo-virt.h
+++ b/arch/m68k/include/uapi/asm/bootinfo-virt.h
@@ -13,13 +13,8 @@
#define BI_VIRT_VIRTIO_BASE 0x8004
#define BI_VIRT_CTRL_BASE 0x8005
-/*
- * A random seed used to initialize the RNG. Record format:
- *
- * - length [ 2 bytes, 16-bit big endian ]
- * - seed data [ `length` bytes, padded to preserve 2-byte alignment ]
- */
-#define BI_VIRT_RNG_SEED 0x8006
+/* No longer used -- replaced with BI_RNG_SEED -- but don't reuse this index:
+ * #define BI_VIRT_RNG_SEED 0x8006 */
#define VIRT_BOOTI_VERSION MK_BI_VERSION(2, 0)
diff --git a/arch/m68k/include/uapi/asm/bootinfo.h b/arch/m68k/include/uapi/asm/bootinfo.h
index 95ecf3ae4c49..024e87d7095f 100644
--- a/arch/m68k/include/uapi/asm/bootinfo.h
+++ b/arch/m68k/include/uapi/asm/bootinfo.h
@@ -64,6 +64,13 @@ struct mem_info {
/* (struct mem_info) */
#define BI_COMMAND_LINE 0x0007 /* kernel command line parameters */
/* (string) */
+/*
+ * A random seed used to initialize the RNG. Record format:
+ *
+ * - length [ 2 bytes, 16-bit big endian ]
+ * - seed data [ `length` bytes, padded to preserve 4-byte struct alignment ]
+ */
+#define BI_RNG_SEED 0x0008
/*
diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c
index 7e7ef67cff8b..e45cc9923703 100644
--- a/arch/m68k/kernel/setup_mm.c
+++ b/arch/m68k/kernel/setup_mm.c
@@ -25,6 +25,7 @@
#include <linux/module.h>
#include <linux/nvram.h>
#include <linux/initrd.h>
+#include <linux/random.h>
#include <asm/bootinfo.h>
#include <asm/byteorder.h>
@@ -151,6 +152,17 @@ static void __init m68k_parse_bootinfo(const struct bi_record *record)
sizeof(m68k_command_line));
break;
+ case BI_RNG_SEED: {
+ u16 len = be16_to_cpup(data);
+ add_bootloader_randomness(data + 2, len);
+ /*
+ * Zero the data to preserve forward secrecy, and zero the
+ * length to prevent kexec from using it.
+ */
+ memzero_explicit((void *)data, len + 2);
+ break;
+ }
+
default:
if (MACH_IS_AMIGA)
unknown = amiga_parse_bootinfo(record);
diff --git a/arch/m68k/virt/config.c b/arch/m68k/virt/config.c
index 4ab22946ff68..632ba200ad42 100644
--- a/arch/m68k/virt/config.c
+++ b/arch/m68k/virt/config.c
@@ -2,7 +2,6 @@
#include <linux/reboot.h>
#include <linux/serial_core.h>
-#include <linux/random.h>
#include <clocksource/timer-goldfish.h>
#include <asm/bootinfo.h>
@@ -93,16 +92,6 @@ int __init virt_parse_bootinfo(const struct bi_record *record)
data += 4;
virt_bi_data.virtio.irq = be32_to_cpup(data);
break;
- case BI_VIRT_RNG_SEED: {
- u16 len = be16_to_cpup(data);
- add_bootloader_randomness(data + 2, len);
- /*
- * Zero the data to preserve forward secrecy, and zero the
- * length to prevent kexec from using it.
- */
- memzero_explicit((void *)data, len + 2);
- break;
- }
default:
unknown = 1;
break;
--
2.38.1
From: Jason Gunthorpe <jgg(a)nvidia.com>
[ Upstream commit f423fa1bc9fe1978e6b9f54927411b62cb43eb04 ]
When converting to directly create the vfio_device the mdev driver has to
put a vfio_register_emulated_iommu_dev() in the probe() and a pairing
vfio_unregister_group_dev() in the remove.
This was missed for gvt, add it.
Cc: stable(a)vger.kernel.org
Fixes: 978cf586ac35 ("drm/i915/gvt: convert to use vfio_register_emulated_iommu_dev")
Reported-by: Alex Williamson <alex.williamson(a)redhat.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian(a)intel.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Link: https://lore.kernel.org/r/0-v1-013609965fe8+9d-vfio_gvt_unregister_jgg@nvid…
Signed-off-by: Alex Williamson <alex.williamson(a)redhat.com>
Reviewed-by: Kevin Tian <kevin.tian(a)intel.com> # v6.0 backport
Signed-off-by: Alex Williamson <alex.williamson(a)redhat.com> # v6.0 backport
---
drivers/gpu/drm/i915/gvt/kvmgt.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index e3cd58946477..de89946c4817 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1595,6 +1595,9 @@ static void intel_vgpu_remove(struct mdev_device *mdev)
if (WARN_ON_ONCE(vgpu->attached))
return;
+
+ vfio_unregister_group_dev(&vgpu->vfio_device);
+ vfio_uninit_group_dev(&vgpu->vfio_device);
intel_gvt_destroy_vgpu(vgpu);
}
[Public]
Hi,
USB4 DP tunneling has a bug in 6.0.y and 5.15.y that was fixed in 6.1.
Basically if the Pre-OS connection manager has set up USB4 DP tunnels they're ignored by the OS so you'll have a "working" display in firmware and once the Linux kernel boots up you use lose the display.
Hot plugging works around the issue.
For 6.0.y can you please take this commit to fix it:
b60e31bf18a7 ("thunderbolt: Add DP OUT resource when DP tunnel is discovered")
For 5.15.y can you please take these two commits to fix it:
43bddb26e20a ("thunderbolt: Tear down existing tunnels when resuming from hibernate")
b60e31bf18a7 ("thunderbolt: Add DP OUT resource when DP tunnel is discovered")
Thanks,
From: Sjoerd Simons <sjoerd(a)collabora.com>
The bus->clk_stop_timeout member is only initialized to a non-zero value
during the codec driver probe. This can lead to corner cases where this
value remains pegged at zero when the bus suspends, which results in an
endless loop in sdw_bus_wait_for_clk_prep_deprep().
Corner cases include configurations with no codecs described in the
firmware, or delays in probing codec drivers.
Initializing the default timeout to the smallest non-zero value avoid this
problem and allows for the existing logic to be preserved: the
bus->clk_stop_timeout is set as the maximum required by all codecs
connected on the bus.
Fixes: 1f2dcf3a154ac ("soundwire: intel: set dev_num_ida_min")
Signed-off-by: Sjoerd Simons <sjoerd(a)collabora.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart(a)linux.intel.com>
Reviewed-by: Chao Song <chao.song(a)intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao(a)linux.intel.com>
---
drivers/soundwire/intel.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c
index 25ec9c272239..78d35bb4852c 100644
--- a/drivers/soundwire/intel.c
+++ b/drivers/soundwire/intel.c
@@ -1311,6 +1311,7 @@ static int intel_link_probe(struct auxiliary_device *auxdev,
bus->link_id = auxdev->id;
bus->dev_num_ida_min = INTEL_DEV_NUM_IDA_MIN;
+ bus->clk_stop_timeout = 1;
sdw_cdns_probe(cdns);
--
2.25.1
From: Benjamin Coddington <bcodding(a)redhat.com>
[ Upstream commit f5ea16137a3fa2858620dc9084466491c128535f ]
There's a small window where a LOCK sent during a delegation return can
race with another OPEN on client, but the open stateid has not yet been
updated. In this case, the client doesn't handle the OLD_STATEID error
from the server and will lose this lock, emitting:
"NFS: nfs4_handle_delegation_recall_error: unhandled error -10024".
Fix this by sending the task through the nfs4 error handling in
nfs4_lock_done() when we may have to reconcile our stateid with what the
server believes it to be. For this case, the result is a retry of the
LOCK operation with the updated stateid.
Reported-by: Gonzalo Siero Humet <gsierohu(a)redhat.com>
Signed-off-by: Benjamin Coddington <bcodding(a)redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker(a)Netapp.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/nfs/nfs4proc.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f9f76594b866..9a0f48f7f2b8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6613,6 +6613,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
{
struct nfs4_lockdata *data = calldata;
struct nfs4_lock_state *lsp = data->lsp;
+ struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry));
dprintk("%s: begin!\n", __func__);
@@ -6622,8 +6623,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
data->rpc_status = task->tk_status;
switch (task->tk_status) {
case 0:
- renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
- data->timestamp);
+ renew_lease(server, data->timestamp);
if (data->arg.new_lock && !data->cancelled) {
data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
@@ -6644,6 +6644,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
if (!nfs4_stateid_match(&data->arg.open_stateid,
&lsp->ls_state->open_stateid))
goto out_restart;
+ else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN)
+ goto out_restart;
} else if (!nfs4_stateid_match(&data->arg.lock_stateid,
&lsp->ls_stateid))
goto out_restart;
--
2.35.1
From: Benjamin Coddington <bcodding(a)redhat.com>
[ Upstream commit f5ea16137a3fa2858620dc9084466491c128535f ]
There's a small window where a LOCK sent during a delegation return can
race with another OPEN on client, but the open stateid has not yet been
updated. In this case, the client doesn't handle the OLD_STATEID error
from the server and will lose this lock, emitting:
"NFS: nfs4_handle_delegation_recall_error: unhandled error -10024".
Fix this by sending the task through the nfs4 error handling in
nfs4_lock_done() when we may have to reconcile our stateid with what the
server believes it to be. For this case, the result is a retry of the
LOCK operation with the updated stateid.
Reported-by: Gonzalo Siero Humet <gsierohu(a)redhat.com>
Signed-off-by: Benjamin Coddington <bcodding(a)redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker(a)Netapp.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/nfs/nfs4proc.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7c5dfed0437f..77c2c88621be 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6854,6 +6854,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
{
struct nfs4_lockdata *data = calldata;
struct nfs4_lock_state *lsp = data->lsp;
+ struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry));
dprintk("%s: begin!\n", __func__);
@@ -6863,8 +6864,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
data->rpc_status = task->tk_status;
switch (task->tk_status) {
case 0:
- renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
- data->timestamp);
+ renew_lease(server, data->timestamp);
if (data->arg.new_lock && !data->cancelled) {
data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
@@ -6885,6 +6885,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
if (!nfs4_stateid_match(&data->arg.open_stateid,
&lsp->ls_state->open_stateid))
goto out_restart;
+ else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN)
+ goto out_restart;
} else if (!nfs4_stateid_match(&data->arg.lock_stateid,
&lsp->ls_stateid))
goto out_restart;
--
2.35.1
From: Benjamin Coddington <bcodding(a)redhat.com>
[ Upstream commit f5ea16137a3fa2858620dc9084466491c128535f ]
There's a small window where a LOCK sent during a delegation return can
race with another OPEN on client, but the open stateid has not yet been
updated. In this case, the client doesn't handle the OLD_STATEID error
from the server and will lose this lock, emitting:
"NFS: nfs4_handle_delegation_recall_error: unhandled error -10024".
Fix this by sending the task through the nfs4 error handling in
nfs4_lock_done() when we may have to reconcile our stateid with what the
server believes it to be. For this case, the result is a retry of the
LOCK operation with the updated stateid.
Reported-by: Gonzalo Siero Humet <gsierohu(a)redhat.com>
Signed-off-by: Benjamin Coddington <bcodding(a)redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker(a)Netapp.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/nfs/nfs4proc.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 03f09399abf4..36af3734ac87 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7014,6 +7014,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
{
struct nfs4_lockdata *data = calldata;
struct nfs4_lock_state *lsp = data->lsp;
+ struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry));
dprintk("%s: begin!\n", __func__);
@@ -7023,8 +7024,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
data->rpc_status = task->tk_status;
switch (task->tk_status) {
case 0:
- renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
- data->timestamp);
+ renew_lease(server, data->timestamp);
if (data->arg.new_lock && !data->cancelled) {
data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
@@ -7045,6 +7045,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
if (!nfs4_stateid_match(&data->arg.open_stateid,
&lsp->ls_state->open_stateid))
goto out_restart;
+ else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN)
+ goto out_restart;
} else if (!nfs4_stateid_match(&data->arg.lock_stateid,
&lsp->ls_stateid))
goto out_restart;
--
2.35.1
From: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
[ Upstream commit cf00b33058b196b4db928419dde68993b15a975b ]
A bug in the LSA code resulted in transfers slightly larger
than the mailbox size. Let us make it easier to catch similar
issues in future by adding a low level check.
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Link: https://lore.kernel.org/r/20220815154044.24733-2-Jonathan.Cameron@huawei.com
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/cxl/core/mbox.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 16176b9278b4..0c90f13870a4 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -174,7 +174,7 @@ int cxl_mbox_send_cmd(struct cxl_dev_state *cxlds, u16 opcode, void *in,
};
int rc;
- if (out_size > cxlds->payload_size)
+ if (in_size > cxlds->payload_size || out_size > cxlds->payload_size)
return -E2BIG;
rc = cxlds->mbox_send(cxlds, &mbox_cmd);
--
2.35.1
madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page
tables associated with the address range. For hugetlb vmas,
zap_page_range will call __unmap_hugepage_range_final. However,
__unmap_hugepage_range_final assumes the passed vma is about to be removed
and deletes the vma_lock to prevent pmd sharing as the vma is on the way
out. In the case of madvise(MADV_DONTNEED) the vma remains, but the
missing vma_lock prevents pmd sharing and could potentially lead to issues
with truncation/fault races.
This issue was originally reported here [1] as a BUG triggered in
page_try_dup_anon_rmap. Prior to the introduction of the hugetlb
vma_lock, __unmap_hugepage_range_final cleared the VM_MAYSHARE flag to
prevent pmd sharing. Subsequent faults on this vma were confused as
VM_MAYSHARE indicates a sharable vma, but was not set so page_mapping was
not set in new pages added to the page table. This resulted in pages that
appeared anonymous in a VM_SHARED vma and triggered the BUG.
Address issue by:
- Add a new zap flag ZAP_FLAG_UNMAP to indicate an unmap call from
unmap_vmas(). This is used to indicate the 'final' unmapping of a vma.
When called via MADV_DONTNEED, this flag is not set and the vm_lock is
not deleted.
- mmu notification is removed from __unmap_hugepage_range to avoid
duplication, and notification is added to the other calling routine
(unmap_hugepage_range).
- notification range in zap_page_range_single is updated to take into
account the possibility of hugetlb pmd sharing.
- zap_page_range_single renamed to __zap_page_range_single to be used
internally within mm/memory.c
- zap_vma_range() interface created to zap a range within a single vma.
- madvise_dontneed_single_vma is updated to call zap_vma_range instead of
zap_page_range as it only operates on a range within a single vma
[1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6J…
Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reported-by: Wei Chen <harperchen1110(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
---
include/linux/mm.h | 5 +++++
mm/hugetlb.c | 45 +++++++++++++++++++++++++++------------------
mm/madvise.c | 4 ++--
mm/memory.c | 17 +++++++++++++----
4 files changed, 47 insertions(+), 24 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 978c17df053e..d205bcd9cd2e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1840,6 +1840,8 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
+void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size);
void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *start_vma, unsigned long start,
unsigned long end);
@@ -3464,4 +3466,7 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
*/
#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
+/* Set in unmap_vmas() to indicate an unmap call. Only used by hugetlb */
+#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
+
#endif /* _LINUX_MM_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ceb47c4e183a..7c8fbce4441e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5072,7 +5072,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5087,13 +5086,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_change_page_size(tlb, sz);
tlb_start_vma(tlb, vma);
- /*
- * If sharing possible, alert mmu notifiers of worst case.
- */
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
- end);
- adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
- mmu_notifier_invalidate_range_start(&range);
last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
@@ -5178,7 +5170,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
if (ref_page)
break;
}
- mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
/*
@@ -5203,32 +5194,50 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
{
+ bool final = zap_flags & ZAP_FLAG_UNMAP;
+
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
/*
- * Unlock and free the vma lock before releasing i_mmap_rwsem. When
- * the vma_lock is freed, this makes the vma ineligible for pmd
- * sharing. And, i_mmap_rwsem is required to set up pmd sharing.
- * This is important as page tables for this unmapped range will
- * be asynchrously deleted. If the page tables are shared, there
- * will be issues when accessed by someone else.
+ * When called via zap_vma_range (MADV_DONTNEED), this is not the
+ * final unmap of the vma, and we do not want to delete the vma_lock.
*/
- __hugetlb_vma_unlock_write_free(vma);
-
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ if (final) {
+ /*
+ * Unlock and free the vma lock before releasing i_mmap_rwsem.
+ * When the vma_lock is freed, this makes the vma ineligible
+ * for pmd sharing. And, i_mmap_rwsem is required to set up
+ * pmd sharing. This is important as page tables for this
+ * unmapped range will be asynchrously deleted. If the page
+ * tables are shared, there will be issues when accessed by
+ * someone else.
+ */
+ __hugetlb_vma_unlock_write_free(vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
{
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
tlb_gather_mmu(&tlb, vma->vm_mm);
+
__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+
+ mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
diff --git a/mm/madvise.c b/mm/madvise.c
index c7105ec6d08c..9d2625b8029a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -772,7 +772,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for shrink_active_list to actually free
+ * zap_vma_range call sets things up for shrink_active_list to actually free
* these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
* shrink_active_list to pick up before reclaiming other pages.
@@ -790,7 +790,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range(vma, start, end - start);
+ zap_vma_range(vma, start, end - start);
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index 6090124b64f1..af3a4724b464 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1717,7 +1717,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
}
/**
- * zap_page_range_single - remove user pages in a given range
+ * __zap_page_range_single - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
* @address: starting address of pages to zap
* @size: number of bytes to zap
@@ -1725,7 +1725,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
*
* The range must fit into one VMA.
*/
-static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+static void __zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mmu_notifier_range range;
@@ -1734,6 +1734,9 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address, address + size);
+ if (is_vm_hugetlb_page(vma))
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
@@ -1742,6 +1745,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
tlb_finish_mmu(&tlb);
}
+void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size)
+{
+ __zap_page_range_single(vma, address, size, NULL);
+}
+
/**
* zap_vma_ptes - remove ptes mapping the vma
* @vma: vm_area_struct holding ptes to be zapped
@@ -1760,7 +1769,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
!(vma->vm_flags & VM_PFNMAP))
return;
- zap_page_range_single(vma, address, size, NULL);
+ __zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
@@ -3453,7 +3462,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
{
- zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
+ __zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
--
2.37.3
On Thu, Nov 10, 2022 at 01:53:25PM -0800, Ives van Hoorne wrote:
> I verified these latest changes with my initial test case, and can confirm
> that I haven't been able to reproduce the problem anymore! Usually, I would
> be able to reproduce the error after ~1-5 runs, given that I would fill the
> page cache before the run, now I have been running the same test case for
> 50+ times, and I haven't seen the error anymore.
>
> Thanks again Peter for taking the time to find the issue and coming up with
> a fix!
Thanks for the quick feedback, Ives!
--
Peter Xu
--
Hello,
We are privileged and delighted to reach you via email" And we are
urgently waiting to hear from you. and again your number is not
connecting.
Best regards,
Prof. Do-Young Byun
This is a note to let you know that I've just added the patch titled
firmware: coreboot: Register bus in module init
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 65946690ed8d972fdb91a74ee75ac0f0f0d68321 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)chromium.org>
Date: Wed, 19 Oct 2022 18:10:53 -0700
Subject: firmware: coreboot: Register bus in module init
The coreboot_table driver registers a coreboot bus while probing a
"coreboot_table" device representing the coreboot table memory region.
Probing this device (i.e., registering the bus) is a dependency for the
module_init() functions of any driver for this bus (e.g.,
memconsole-coreboot.c / memconsole_driver_init()).
With synchronous probe, this dependency works OK, as the link order in
the Makefile ensures coreboot_table_driver_init() (and thus,
coreboot_table_probe()) completes before a coreboot device driver tries
to add itself to the bus.
With asynchronous probe, however, coreboot_table_probe() may race with
memconsole_driver_init(), and so we're liable to hit one of these two:
1. coreboot_driver_register() eventually hits "[...] the bus was not
initialized.", and the memconsole driver fails to register; or
2. coreboot_driver_register() gets past #1, but still races with
bus_register() and hits some other undefined/crashing behavior (e.g.,
in driver_find() [1])
We can resolve this by registering the bus in our initcall, and only
deferring "device" work (scanning the coreboot memory region and
creating sub-devices) to probe().
[1] Example failure, using 'driver_async_probe=*' kernel command line:
[ 0.114217] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010
...
[ 0.114307] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.1.0-rc1 #63
[ 0.114316] Hardware name: Google Scarlet (DT)
...
[ 0.114488] Call trace:
[ 0.114494] _raw_spin_lock+0x34/0x60
[ 0.114502] kset_find_obj+0x28/0x84
[ 0.114511] driver_find+0x30/0x50
[ 0.114520] driver_register+0x64/0x10c
[ 0.114528] coreboot_driver_register+0x30/0x3c
[ 0.114540] memconsole_driver_init+0x24/0x30
[ 0.114550] do_one_initcall+0x154/0x2e0
[ 0.114560] do_initcall_level+0x134/0x160
[ 0.114571] do_initcalls+0x60/0xa0
[ 0.114579] do_basic_setup+0x28/0x34
[ 0.114588] kernel_init_freeable+0xf8/0x150
[ 0.114596] kernel_init+0x2c/0x12c
[ 0.114607] ret_from_fork+0x10/0x20
[ 0.114624] Code: 5280002b 1100054a b900092a f9800011 (885ffc01)
[ 0.114631] ---[ end trace 0000000000000000 ]---
Fixes: b81e3140e412 ("firmware: coreboot: Make bus registration symmetric")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Reviewed-by: Stephen Boyd <swboyd(a)chromium.org>
Link: https://lore.kernel.org/r/20221019180934.1.If29e167d8a4771b0bf4a39c89c6946e…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/firmware/google/coreboot_table.c | 37 +++++++++++++++++++-----
1 file changed, 29 insertions(+), 8 deletions(-)
diff --git a/drivers/firmware/google/coreboot_table.c b/drivers/firmware/google/coreboot_table.c
index c52bcaa9def6..9ca21feb9d45 100644
--- a/drivers/firmware/google/coreboot_table.c
+++ b/drivers/firmware/google/coreboot_table.c
@@ -149,12 +149,8 @@ static int coreboot_table_probe(struct platform_device *pdev)
if (!ptr)
return -ENOMEM;
- ret = bus_register(&coreboot_bus_type);
- if (!ret) {
- ret = coreboot_table_populate(dev, ptr);
- if (ret)
- bus_unregister(&coreboot_bus_type);
- }
+ ret = coreboot_table_populate(dev, ptr);
+
memunmap(ptr);
return ret;
@@ -169,7 +165,6 @@ static int __cb_dev_unregister(struct device *dev, void *dummy)
static int coreboot_table_remove(struct platform_device *pdev)
{
bus_for_each_dev(&coreboot_bus_type, NULL, NULL, __cb_dev_unregister);
- bus_unregister(&coreboot_bus_type);
return 0;
}
@@ -199,6 +194,32 @@ static struct platform_driver coreboot_table_driver = {
.of_match_table = of_match_ptr(coreboot_of_match),
},
};
-module_platform_driver(coreboot_table_driver);
+
+static int __init coreboot_table_driver_init(void)
+{
+ int ret;
+
+ ret = bus_register(&coreboot_bus_type);
+ if (ret)
+ return ret;
+
+ ret = platform_driver_register(&coreboot_table_driver);
+ if (ret) {
+ bus_unregister(&coreboot_bus_type);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __exit coreboot_table_driver_exit(void)
+{
+ platform_driver_unregister(&coreboot_table_driver);
+ bus_unregister(&coreboot_bus_type);
+}
+
+module_init(coreboot_table_driver_init);
+module_exit(coreboot_table_driver_exit);
+
MODULE_AUTHOR("Google, Inc.");
MODULE_LICENSE("GPL");
--
2.38.1
This is a note to let you know that I've just added the patch titled
docs: update mediator contact information in CoC doc
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 5fddf8962b429b8303c4a654291ecb6e61a7d747 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan(a)linuxfoundation.org>
Date: Tue, 11 Oct 2022 11:14:17 -0600
Subject: docs: update mediator contact information in CoC doc
Update mediator contact information in CoC interpretation document.
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Shuah Khan <skhan(a)linuxfoundation.org>
Link: https://lore.kernel.org/r/20221011171417.34286-1-skhan@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
Documentation/process/code-of-conduct-interpretation.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Documentation/process/code-of-conduct-interpretation.rst b/Documentation/process/code-of-conduct-interpretation.rst
index 922e0b547bc3..66b07f14714c 100644
--- a/Documentation/process/code-of-conduct-interpretation.rst
+++ b/Documentation/process/code-of-conduct-interpretation.rst
@@ -51,7 +51,7 @@ the Technical Advisory Board (TAB) or other maintainers if you're
uncertain how to handle situations that come up. It will not be
considered a violation report unless you want it to be. If you are
uncertain about approaching the TAB or any other maintainers, please
-reach out to our conflict mediator, Joanna Lee <joanna.lee(a)gesmer.com>.
+reach out to our conflict mediator, Joanna Lee <jlee(a)linuxfoundation.org>.
In the end, "be kind to each other" is really what the end goal is for
everybody. We know everyone is human and we all fail at times, but the
--
2.38.1
This is a note to let you know that I've just added the patch titled
slimbus: stream: correct presence rate frequencies
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From b9c1939627f8185dec8ba6d741e9573a4c7a5834 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Date: Thu, 29 Sep 2022 18:52:02 +0200
Subject: slimbus: stream: correct presence rate frequencies
Correct few frequencies in presence rate table - multiplied by 10
(110250 instead of 11025 Hz).
Fixes: abb9c9b8b51b ("slimbus: stream: add stream support")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Link: https://lore.kernel.org/r/20220929165202.410937-1-krzysztof.kozlowski@linar…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/slimbus/stream.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/slimbus/stream.c b/drivers/slimbus/stream.c
index 75f87b3d8b95..73a2aa362957 100644
--- a/drivers/slimbus/stream.c
+++ b/drivers/slimbus/stream.c
@@ -67,10 +67,10 @@ static const int slim_presence_rate_table[] = {
384000,
768000,
0, /* Reserved */
- 110250,
- 220500,
- 441000,
- 882000,
+ 11025,
+ 22050,
+ 44100,
+ 88200,
176400,
352800,
705600,
--
2.38.1
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 36b038791e1e2baea892e9276588815fd14894b4
Gitweb: https://git.kernel.org/tip/36b038791e1e2baea892e9276588815fd14894b4
Author: Mel Gorman <mgorman(a)techsingularity.net>
AuthorDate: Thu, 10 Nov 2022 12:44:00
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Thu, 10 Nov 2022 16:57:38 +01:00
x86/fpu: Drop fpregs lock before inheriting FPU permissions
Mike Galbraith reported the following against an old fork of preempt-rt
but the same issue also applies to the current preempt-rt tree.
BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: systemd
preempt_count: 1, expected: 0
RCU nest depth: 0, expected: 0
Preemption disabled at:
fpu_clone
CPU: 6 PID: 1 Comm: systemd Tainted: G E (unreleased)
Call Trace:
<TASK>
dump_stack_lvl
? fpu_clone
__might_resched
rt_spin_lock
fpu_clone
? copy_thread
? copy_process
? shmem_alloc_inode
? kmem_cache_alloc
? kernel_clone
? __do_sys_clone
? do_syscall_64
? __x64_sys_rt_sigprocmask
? syscall_exit_to_user_mode
? do_syscall_64
? syscall_exit_to_user_mode
? do_syscall_64
? syscall_exit_to_user_mode
? do_syscall_64
? exc_page_fault
? entry_SYSCALL_64_after_hwframe
</TASK>
Mike says:
The splat comes from fpu_inherit_perms() being called under fpregs_lock(),
and us reaching the spin_lock_irq() therein due to fpu_state_size_dynamic()
returning true despite static key __fpu_state_size_dynamic having never
been enabled.
Mike's assessment looks correct. fpregs_lock on a PREEMPT_RT kernel disables
preemption so calling spin_lock_irq() in fpu_inherit_perms() is unsafe. This
problem exists since commit
9e798e9aa14c ("x86/fpu: Prepare fpu_clone() for dynamically enabled features").
Even though the original bug report should not have enabled the paths at
all, the bug still exists.
fpregs_lock is necessary when editing the FPU registers or a task's FP
state but it is not necessary for fpu_inherit_perms(). The only write
of any FP state in fpu_inherit_perms() is for the new child which is
not running yet and cannot context switch or be borrowed by a kernel
thread yet. Hence, fpregs_lock is not protecting anything in the new
child until clone() completes and can be dropped earlier. The siglock
still needs to be acquired by fpu_inherit_perms() as the read of the
parent's permissions has to be serialised.
[ bp: Cleanup splat. ]
Fixes: 9e798e9aa14c ("x86/fpu: Prepare fpu_clone() for dynamically enabled features")
Reported-by: Mike Galbraith <efault(a)gmx.de>
Signed-off-by: Mel Gorman <mgorman(a)techsingularity.net>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Reviewed-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20221110124400.zgymc2lnwqjukgfh@techsingularity.n…
---
arch/x86/kernel/fpu/core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 3b28c5b..d00db56 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -605,9 +605,9 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal)
if (test_thread_flag(TIF_NEED_FPU_LOAD))
fpregs_restore_userregs();
save_fpregs_to_fpstate(dst_fpu);
+ fpregs_unlock();
if (!(clone_flags & CLONE_THREAD))
fpu_inherit_perms(dst_fpu);
- fpregs_unlock();
/*
* Children never inherit PASID state.
Ives van Hoorne from codesandbox.io reported an issue regarding possible
data loss of uffd-wp when applied to memfds on heavily loaded systems. The
sympton is some read page got data mismatch from the snapshot child VMs.
Here I can also reproduce with a Rust reproducer that was provided by Ives
that keeps taking snapshot of a 256MB VM, on a 32G system when I initiate
80 instances I can trigger the issues in ten minutes.
It turns out that we got some pages write-through even if uffd-wp is
applied to the pte.
The problem is, when removing migration entries, we didn't really worry
about write bit as long as we know it's not a write migration entry. That
may not be true, for some memory types (e.g. writable shmem) mk_pte can
return a pte with write bit set, then to recover the migration entry to its
original state we need to explicit wr-protect the pte or it'll has the
write bit set if it's a read migration entry.
For uffd it can cause write-through. I didn't verify, but I think it'll be
the same for mprotect()ed pages and after migration we can miss the sigbus
instead.
The relevant code on uffd was introduced in the anon support, which is
commit f45ec5ff16a7 ("userfaultfd: wp: support swap and page migration",
2020-04-07). However anon shouldn't suffer from this problem because anon
should already have the write bit cleared always, so that may not be a
proper Fixes target. To satisfy the need on the backport, I'm attaching
the Fixes tag to the uffd-wp shmem support. Since no one had issue with
mprotect, so I assume that's also the kernel version we should start to
backport for stable, and we shouldn't need to worry before that.
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
Reported-by: Ives van Hoorne <ives(a)codesandbox.io>
Signed-off-by: Peter Xu <peterx(a)redhat.com>
---
mm/migrate.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index dff333593a8a..8b6351c08c78 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -213,8 +213,14 @@ static bool remove_migration_pte(struct folio *folio,
pte = pte_mkdirty(pte);
if (is_writable_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
- else if (pte_swp_uffd_wp(*pvmw.pte))
+ else
+ /* NOTE: mk_pte can have write bit set */
+ pte = pte_wrprotect(pte);
+
+ if (pte_swp_uffd_wp(*pvmw.pte)) {
+ WARN_ON_ONCE(pte_write(pte));
pte = pte_mkuffd_wp(pte);
+ }
if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
rmap_flags |= RMAP_EXCLUSIVE;
--
2.37.3
From: Xiubo Li <xiubli(a)redhat.com>
The request's r_session maybe changed when it was forwarded or
resent. Both the forwarding and resending cases the requests will
be protected by the mdsc->mutex.
Cc: stable(a)vger.kernel.org
URL: https://bugzilla.redhat.com/show_bug.cgi?id=2137955
Signed-off-by: Xiubo Li <xiubli(a)redhat.com>
---
Changed in V5:
- simplify the code by removing the "unlikely(s->s_mds >= max_sessions)" check.
Changed in V4:
- move mdsc->mutex acquisition and max_sessions assignment into "if (req1 || req2)" branch
fs/ceph/caps.c | 48 ++++++++++++------------------------------------
1 file changed, 12 insertions(+), 36 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 894adfb4a092..065e9311b607 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2297,7 +2297,6 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
- unsigned int max_sessions;
int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
@@ -2315,28 +2314,24 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
}
spin_unlock(&ci->i_unsafe_lock);
- /*
- * The mdsc->max_sessions is unlikely to be changed
- * mostly, here we will retry it by reallocating the
- * sessions array memory to get rid of the mdsc->mutex
- * lock.
- */
-retry:
- max_sessions = mdsc->max_sessions;
-
/*
* Trigger to flush the journal logs in all the relevant MDSes
* manually, or in the worst case we must wait at most 5 seconds
* to wait the journal logs to be flushed by the MDSes periodically.
*/
- if ((req1 || req2) && likely(max_sessions)) {
- struct ceph_mds_session **sessions = NULL;
- struct ceph_mds_session *s;
+ if (req1 || req2) {
struct ceph_mds_request *req;
+ struct ceph_mds_session **sessions;
+ struct ceph_mds_session *s;
+ unsigned int max_sessions;
int i;
+ mutex_lock(&mdsc->mutex);
+ max_sessions = mdsc->max_sessions;
+
sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
if (!sessions) {
+ mutex_unlock(&mdsc->mutex);
err = -ENOMEM;
goto out;
}
@@ -2348,16 +2343,6 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
s = req->r_session;
if (!s)
continue;
- if (unlikely(s->s_mds >= max_sessions)) {
- spin_unlock(&ci->i_unsafe_lock);
- for (i = 0; i < max_sessions; i++) {
- s = sessions[i];
- if (s)
- ceph_put_mds_session(s);
- }
- kfree(sessions);
- goto retry;
- }
if (!sessions[s->s_mds]) {
s = ceph_get_mds_session(s);
sessions[s->s_mds] = s;
@@ -2370,16 +2355,6 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
s = req->r_session;
if (!s)
continue;
- if (unlikely(s->s_mds >= max_sessions)) {
- spin_unlock(&ci->i_unsafe_lock);
- for (i = 0; i < max_sessions; i++) {
- s = sessions[i];
- if (s)
- ceph_put_mds_session(s);
- }
- kfree(sessions);
- goto retry;
- }
if (!sessions[s->s_mds]) {
s = ceph_get_mds_session(s);
sessions[s->s_mds] = s;
@@ -2391,11 +2366,12 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
/* the auth MDS */
spin_lock(&ci->i_ceph_lock);
if (ci->i_auth_cap) {
- s = ci->i_auth_cap->session;
- if (!sessions[s->s_mds])
- sessions[s->s_mds] = ceph_get_mds_session(s);
+ s = ci->i_auth_cap->session;
+ if (!sessions[s->s_mds])
+ sessions[s->s_mds] = ceph_get_mds_session(s);
}
spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&mdsc->mutex);
/* send flush mdlog request to MDSes */
for (i = 0; i < max_sessions; i++) {
--
2.31.1
Hello,
I saw that the projected EOL of LTS 5.15 is Oct 2023.
How likely is it that the date will be extended? I'm guessing it's
pretty likely, given that Android uses it.
Thanks,
-- Suleiman
From: Eric Biggers <ebiggers(a)google.com>
Currently, registering an algorithm with the crypto API always causes a
notification to be posted to the "cryptomgr", which then creates a
kthread to self-test the algorithm. However, if self-tests are disabled
in the kconfig (as is the default option), then this kthread just
notifies waiters that the algorithm has been tested, then exits.
This causes a significant amount of overhead, especially in the kthread
creation and destruction, which is not necessary at all. For example,
in a quick test I found that booting a "minimum" x86_64 kernel with all
the crypto options enabled (except for the self-tests) takes about 400ms
until PID 1 can start. Of that, a full 13ms is spent just doing this
pointless dance, involving a kthread being created, run, and destroyed
over 200 times. That's over 3% of the entire kernel start time.
Fix this by just skipping the creation of the test larval and the
posting of the registration notification entirely, when self-tests are
disabled. Also compile out the unnecessary code in algboss.c.
While this patch is an optimization and not a "fix" per se, I've marked
it as for stable, due to the large improvement it can make to boot time.
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
crypto/algapi.c | 3 ++-
crypto/algboss.c | 11 +++++++----
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/crypto/algapi.c b/crypto/algapi.c
index 5c69ff8e8fa5c..018935cb8417d 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -226,7 +226,8 @@ static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg)
{
struct crypto_larval *larval;
- if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER))
+ if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER) ||
+ IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS))
return NULL;
larval = crypto_larval_alloc(alg->cra_name,
diff --git a/crypto/algboss.c b/crypto/algboss.c
index eb5fe84efb83e..e6f0443d08048 100644
--- a/crypto/algboss.c
+++ b/crypto/algboss.c
@@ -171,16 +171,18 @@ static int cryptomgr_schedule_probe(struct crypto_larval *larval)
return NOTIFY_OK;
}
+#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
+static int cryptomgr_schedule_test(struct crypto_alg *alg)
+{
+ return 0;
+}
+#else
static int cryptomgr_test(void *data)
{
struct crypto_test_param *param = data;
u32 type = param->type;
int err = 0;
-#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
- goto skiptest;
-#endif
-
if (type & CRYPTO_ALG_TESTED)
goto skiptest;
@@ -229,6 +231,7 @@ static int cryptomgr_schedule_test(struct crypto_alg *alg)
err:
return NOTIFY_OK;
}
+#endif /* !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS */
static int cryptomgr_notify(struct notifier_block *this, unsigned long msg,
void *data)
base-commit: f67dd6ce0723ad013395f20a3f79d8a437d3f455
--
2.38.1
This is the start of the stable review cycle for the 4.9.333 release.
There are 30 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Thu, 10 Nov 2022 13:33:17 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.9.333-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.9.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.9.333-rc1
Dokyung Song <dokyung.song(a)gmail.com>
wifi: brcmfmac: Fix potential buffer overflow in brcmf_fweh_event_worker()
Maxim Levitsky <mlevitsk(a)redhat.com>
KVM: x86: emulator: update the emulation mode after CR0 write
Maxim Levitsky <mlevitsk(a)redhat.com>
KVM: x86: emulator: introduce emulator_recalc_and_set_mode
Maxim Levitsky <mlevitsk(a)redhat.com>
KVM: x86: emulator: em_sysexit should update ctxt->mode
Jim Mattson <jmattson(a)google.com>
KVM: x86: Mask off reserved bits in CPUID.80000008H
Ye Bin <yebin10(a)huawei.com>
ext4: fix warning in 'ext4_da_release_space'
Helge Deller <deller(a)gmx.de>
parisc: Export iosapic_serial_irq() symbol for serial port driver
Helge Deller <deller(a)gmx.de>
parisc: Make 8250_gsc driver dependend on CONFIG_PARISC
John Veness <john-linux(a)pelago.org.uk>
ALSA: usb-audio: Add quirks for MacroSilicon MS2100/MS2106 devices
David Sterba <dsterba(a)suse.com>
btrfs: fix type of parameter generation in btrfs_get_dentry
Luiz Augusto von Dentz <luiz.von.dentz(a)intel.com>
Bluetooth: L2CAP: Fix attempting to access uninitialized memory
Martin Tůma <martin.tuma(a)digiteqautomotive.com>
i2c: xiic: Add platform module alias
Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
media: dvb-frontends/drxk: initialize err to 0
Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
media: s5p_cec: limit msg.len to CEC_MAX_MSG_SIZE
Gaosheng Cui <cuigaosheng1(a)huawei.com>
net: mdio: fix undefined behavior in bit shift for __mdiobus_register
Zhengchao Shao <shaozhengchao(a)huawei.com>
Bluetooth: L2CAP: fix use-after-free in l2cap_conn_del()
Maxim Mikityanskiy <maxtram95(a)gmail.com>
Bluetooth: L2CAP: Fix use-after-free caused by l2cap_reassemble_sdu
Filipe Manana <fdmanana(a)suse.com>
btrfs: fix ulist leaks in error paths of qgroup self tests
Yang Yingliang <yangyingliang(a)huawei.com>
isdn: mISDN: netjet: fix wrong check of device registration
Yang Yingliang <yangyingliang(a)huawei.com>
mISDN: fix possible memory leak in mISDN_register_device()
Zhang Qilong <zhangqilong3(a)huawei.com>
rose: Fix NULL pointer dereference in rose_send_frame()
Jason A. Donenfeld <Jason(a)zx2c4.com>
ipvs: use explicitly signed chars
Dan Carpenter <dan.carpenter(a)oracle.com>
net: sched: Fix use after free in red_enqueue()
Sergey Shtylyov <s.shtylyov(a)omp.ru>
ata: pata_legacy: fix pdc20230_set_piomode()
Zhang Changzhong <zhangchangzhong(a)huawei.com>
net: fec: fix improper use of NETDEV_TX_BUSY
Shang XiaoJing <shangxiaojing(a)huawei.com>
nfc: nfcmrvl: Fix potential memory leak in nfcmrvl_i2c_nci_send()
Shang XiaoJing <shangxiaojing(a)huawei.com>
nfc: s3fwrn5: Fix potential memory leak in s3fwrn5_nci_send()
Zhang Xiaoxu <zhangxiaoxu5(a)huawei.com>
nfs4: Fix kmemleak when allocate slot failed
Trond Myklebust <trond.myklebust(a)hammerspace.com>
NFSv4.1: We must always send RECLAIM_COMPLETE after a reboot
Trond Myklebust <trond.myklebust(a)hammerspace.com>
NFSv4.1: Handle RECLAIM_COMPLETE trunking errors
-------------
Diffstat:
Makefile | 4 +-
arch/x86/kvm/cpuid.c | 1 +
arch/x86/kvm/emulate.c | 102 +++++++++++++++------
drivers/ata/pata_legacy.c | 5 +-
drivers/i2c/busses/i2c-xiic.c | 1 +
drivers/isdn/hardware/mISDN/netjet.c | 2 +-
drivers/isdn/mISDN/core.c | 5 +-
drivers/media/dvb-frontends/drxk_hard.c | 2 +-
drivers/net/ethernet/freescale/fec_main.c | 4 +-
drivers/net/phy/mdio_bus.c | 2 +-
.../wireless/broadcom/brcm80211/brcmfmac/fweh.c | 4 +
drivers/nfc/nfcmrvl/i2c.c | 7 +-
drivers/nfc/s3fwrn5/core.c | 8 +-
drivers/parisc/iosapic.c | 1 +
drivers/staging/media/s5p-cec/s5p_cec.c | 2 +
drivers/tty/serial/8250/Kconfig | 2 +-
fs/btrfs/export.c | 2 +-
fs/btrfs/export.h | 2 +-
fs/btrfs/tests/qgroup-tests.c | 20 +++-
fs/ext4/migrate.c | 3 +-
fs/nfs/nfs4client.c | 1 +
fs/nfs/nfs4state.c | 2 +
net/bluetooth/l2cap_core.c | 52 +++++++++--
net/netfilter/ipvs/ip_vs_conn.c | 4 +-
net/rose/rose_link.c | 3 +
net/sched/sch_red.c | 4 +-
sound/usb/quirks-table.h | 58 ++++++++++++
sound/usb/quirks.c | 1 +
28 files changed, 241 insertions(+), 63 deletions(-)
When pci_create_attr() fails then pci_remove_resource_files() is called
which will iterate over the res_attr[_wc] arrays and frees every non
NULL entry. To avoid a double free here we have to set the failed entry
to NULL in pci_create_attr() when freeing it.
Fixes: b562ec8f74e4 ("PCI: Don't leak memory if sysfs_create_bin_file() fails")
Signed-off-by: Sascha Hauer <s.hauer(a)pengutronix.de>
Cc: <stable(a)vger.kernel.org>
---
drivers/pci/pci-sysfs.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index fc804e08e3cb5..a07381d46ddae 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1196,8 +1196,13 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine)
res_attr->size = pci_resource_len(pdev, num);
res_attr->private = (void *)(unsigned long)num;
retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr);
- if (retval)
+ if (retval) {
+ if (write_combine)
+ pdev->res_attr_wc[num] = NULL;
+ else
+ pdev->res_attr[num] = NULL;
kfree(res_attr);
+ }
return retval;
}
--
2.30.2
Restoration of the host IA32_SPEC_CTRL value is probably too late
with respect to the return thunk training sequence.
With respect to the user/kernel boundary, AMD says, "If software chooses
to toggle STIBP (e.g., set STIBP on kernel entry, and clear it on kernel
exit), software should set STIBP to 1 before executing the return thunk
training sequence." I assume the same requirements apply to the guest/host
boundary. The return thunk training sequence is in vmenter.S, quite close
to the VM-exit. On hosts without V_SPEC_CTRL, however, the host's
IA32_SPEC_CTRL value is not restored until much later.
To avoid this, move the restoration of host SPEC_CTRL to assembly and,
for consistency, move the restoration of the guest SPEC_CTRL as well.
This is not particularly difficult, apart from some care to cover both
32- and 64-bit, and to share code between SEV-ES and normal vmentry.
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Suggested-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
arch/x86/kernel/asm-offsets.c | 1 +
arch/x86/kernel/cpu/bugs.c | 13 ++---
arch/x86/kvm/svm/svm.c | 38 ++++++---------
arch/x86/kvm/svm/svm.h | 4 +-
arch/x86/kvm/svm/vmenter.S | 92 ++++++++++++++++++++++++++++++++++-
5 files changed, 111 insertions(+), 37 deletions(-)
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 69d1fed51086..d0bd68af0a5a 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -115,6 +115,7 @@ static void __used common(void)
OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index da7c361f47e0..6ec0b7ce7453 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -196,22 +196,15 @@ void __init check_bugs(void)
}
/*
- * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is
- * done in vmenter.S.
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
*/
void
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
{
- u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
+ u64 guestval, hostval;
struct thread_info *ti = current_thread_info();
- if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
- if (hostval != guestval) {
- msrval = setguest ? guestval : hostval;
- wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
- }
- }
-
/*
* If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
* MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 381c7dcffe25..31aa158a2e10 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -731,6 +731,15 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
u32 offset;
u32 *msrpm;
+ /*
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
to_svm(vcpu)->msrpm;
@@ -3912,18 +3921,19 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
struct vcpu_svm *svm = to_svm(vcpu);
guest_state_enter_irqoff();
if (sev_es_guest(vcpu->kvm)) {
- __svm_sev_es_vcpu_run(svm);
+ __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
} else {
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
- __svm_vcpu_run(svm, __sme_page_pa(sd->save_area));
+ __svm_vcpu_run(svm, __sme_page_pa(sd->save_area),
+ spec_ctrl_intercepted);
}
guest_state_exit_irqoff();
@@ -3932,6 +3942,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
trace_kvm_entry(vcpu);
@@ -3990,26 +4001,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- svm_vcpu_enter_exit(vcpu);
-
- /*
- * We do not use IBRS in the kernel. If this vCPU has used the
- * SPEC_CTRL MSR it may have left it on; save the value and
- * turn it off. This is much more efficient than blindly adding
- * it to the atomic save/restore list. Especially as the former
- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
- *
- * For non-nested case:
- * If the L01 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- *
- * For nested case:
- * If the L02 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- */
- if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
- unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
- svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+ svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
if (!sev_es_guest(vcpu->kvm))
reload_tss(vcpu);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 99410651f2a5..9d940d8736f0 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -483,7 +483,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
-void __svm_sev_es_vcpu_run(struct vcpu_svm *svm);
-void __svm_vcpu_run(struct vcpu_svm *svm, unsigned long hsave_pa);
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_vcpu_run(struct vcpu_svm *svm, unsigned long hsave_pa, bool spec_ctrl_intercepted);
#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 45a4bd002494..9e381386ffdc 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -32,10 +32,64 @@
.section .noinstr.text, "ax"
+.macro RESTORE_GUEST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "jmp 999f", \
+ "", X86_FEATURE_MSR_SPEC_CTRL, \
+ "jmp 999f", X86_FEATURE_V_SPEC_CTRL
+
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+ movl SVM_spec_ctrl(%_ASM_DI), %eax
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ je 999f
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ xor %edx, %edx
+ wrmsr
+999:
+
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "jmp 999f", \
+ "", X86_FEATURE_MSR_SPEC_CTRL, \
+ "jmp 999f", X86_FEATURE_V_SPEC_CTRL
+
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+
+ /*
+ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+ * if it was not intercepted during guest execution.
+ */
+ cmpb $0, (%_ASM_SP)
+ jnz 998f
+ rdmsr
+ movl %eax, SVM_spec_ctrl(%_ASM_DI)
+998:
+
+ /* Now restore the host value of the MSR if different from the guest's. */
+ movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ cmp SVM_spec_ctrl(%_ASM_DI), %eax
+ je 999f
+ xor %edx, %edx
+ wrmsr
+999:
+
+.endm
+
+
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
* @hsave_pa: unsigned long
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BP
@@ -50,7 +104,12 @@ SYM_FUNC_START(__svm_vcpu_run)
#endif
push %_ASM_BX
- /* @hsave_pa is needed last after vmexit, save it first. */
+ /*
+ * Both @spec_ctrl_intercepted and @hsave_pa are used only after vmexit.
+ * @spec_ctrl_intercepted is needed later and accessed directly from
+ * the stack in RESTORE_HOST_SPEC_CTRL, so save it first.
+ */
+ push %_ASM_ARG3
push %_ASM_ARG2
/* Save @svm. */
@@ -61,6 +120,8 @@ SYM_FUNC_START(__svm_vcpu_run)
mov %_ASM_ARG1, %_ASM_DI
.endif
+ RESTORE_GUEST_SPEC_CTRL
+
/*
* Use a single vmcb (vmcb01 because it's always valid) for
* context switching guest state via VMLOAD/VMSAVE, that way
@@ -138,6 +199,8 @@ SYM_FUNC_START(__svm_vcpu_run)
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -173,6 +236,9 @@ SYM_FUNC_START(__svm_vcpu_run)
xor %r15d, %r15d
#endif
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -210,6 +276,7 @@ SYM_FUNC_END(__svm_vcpu_run)
/**
* __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_sev_es_vcpu_run)
push %_ASM_BP
@@ -224,8 +291,21 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
push %_ASM_BX
+ /* Save @spec_ctrl_intercepted for RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /* Move @svm to RDI for RESTORE_GUEST_SPEC_CTRL. */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ RESTORE_GUEST_SPEC_CTRL
+
/* Get svm->current_vmcb->pa into RAX. */
- mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Enter guest mode */
@@ -235,11 +315,16 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
2: cli
+ /* Pop @svm to RDI, guest registers have been saved already. */
+ pop %_ASM_DI
+
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -249,6 +334,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
*/
UNTRAIN_RET
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
--
2.31.1
Restoration of the host IA32_SPEC_CTRL value is probably too late
with respect to the return thunk training sequence.
With respect to the user/kernel boundary, AMD says, "If software chooses
to toggle STIBP (e.g., set STIBP on kernel entry, and clear it on kernel
exit), software should set STIBP to 1 before executing the return thunk
training sequence." I assume the same requirements apply to the guest/host
boundary. The return thunk training sequence is in vmenter.S, quite close
to the VM-exit. On hosts without V_SPEC_CTRL, however, the host's
IA32_SPEC_CTRL value is not restored until much later.
To avoid this, move the restoration of host SPEC_CTRL to assembly and,
for consistency, move the restoration of the guest SPEC_CTRL as well.
This is not particularly difficult, apart from some care to cover both
32- and 64-bit, and to share code between SEV-ES and normal vmentry.
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Suggested-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
arch/x86/kernel/cpu/bugs.c | 13 +---
arch/x86/kvm/kvm-asm-offsets.c | 1 +
arch/x86/kvm/svm/svm.c | 37 ++++------
arch/x86/kvm/svm/svm.h | 4 +-
arch/x86/kvm/svm/vmenter.S | 119 ++++++++++++++++++++++++++++++++-
5 files changed, 136 insertions(+), 38 deletions(-)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index da7c361f47e0..6ec0b7ce7453 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -196,22 +196,15 @@ void __init check_bugs(void)
}
/*
- * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is
- * done in vmenter.S.
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
*/
void
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
{
- u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
+ u64 guestval, hostval;
struct thread_info *ti = current_thread_info();
- if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
- if (hostval != guestval) {
- msrval = setguest ? guestval : hostval;
- wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
- }
- }
-
/*
* If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
* MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
index 1b805cd24d66..24a710d37323 100644
--- a/arch/x86/kvm/kvm-asm-offsets.c
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -16,6 +16,7 @@ static void __used common(void)
BLANK();
OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 469c1b5617af..cf1aed25f4ab 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -720,6 +720,15 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
u32 offset;
u32 *msrpm;
+ /*
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
to_svm(vcpu)->msrpm;
@@ -3901,16 +3910,16 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
struct vcpu_svm *svm = to_svm(vcpu);
guest_state_enter_irqoff();
if (sev_es_guest(vcpu->kvm))
- __svm_sev_es_vcpu_run(svm);
+ __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
else
- __svm_vcpu_run(svm);
+ __svm_vcpu_run(svm, spec_ctrl_intercepted);
guest_state_exit_irqoff();
}
@@ -3918,6 +3927,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
trace_kvm_entry(vcpu);
@@ -3976,26 +3986,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- svm_vcpu_enter_exit(vcpu);
-
- /*
- * We do not use IBRS in the kernel. If this vCPU has used the
- * SPEC_CTRL MSR it may have left it on; save the value and
- * turn it off. This is much more efficient than blindly adding
- * it to the atomic save/restore list. Especially as the former
- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
- *
- * For non-nested case:
- * If the L01 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- *
- * For nested case:
- * If the L02 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- */
- if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
- unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
- svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+ svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
if (!sev_es_guest(vcpu->kvm))
reload_tss(vcpu);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 83955a4e520e..199a2ecef1ce 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -682,7 +682,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
-void __svm_sev_es_vcpu_run(struct vcpu_svm *svm);
-void __svm_vcpu_run(struct vcpu_svm *svm);
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 57440acfc73e..34367dc203f2 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -32,9 +32,69 @@
.section .noinstr.text, "ax"
+.macro RESTORE_GUEST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+801:
+.endm
+.macro RESTORE_GUEST_SPEC_CTRL_BODY
+800:
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR. This is kept out-of-line so that the common
+ * case does not have to jump.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+ movl SVM_spec_ctrl(%_ASM_DI), %eax
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ je 801b
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ xor %edx, %edx
+ wrmsr
+ jmp 801b
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+901:
+.endm
+.macro RESTORE_HOST_SPEC_CTRL_BODY
+900:
+ /* Same for after vmexit. */
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+
+ /*
+ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+ * if it was not intercepted during guest execution.
+ */
+ cmpb $0, (%_ASM_SP)
+ jnz 998f
+ rdmsr
+ movl %eax, SVM_spec_ctrl(%_ASM_DI)
+998:
+
+ /* Now restore the host value of the MSR if different from the guest's. */
+ movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ cmp SVM_spec_ctrl(%_ASM_DI), %eax
+ je 901b
+ xor %edx, %edx
+ wrmsr
+ jmp 901b
+.endm
+
+
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BP
@@ -54,17 +114,26 @@ SYM_FUNC_START(__svm_vcpu_run)
* order compared to when they are needed.
*/
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
/* Needed to restore access to percpu variables. */
__ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa)
- /* Save @svm. */
+ /* Finally save @svm. */
push %_ASM_ARG1
.ifnc _ASM_ARG1, _ASM_DI
- /* Move @svm to RDI. */
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
mov %_ASM_ARG1, %_ASM_DI
.endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
/*
* Use a single vmcb (vmcb01 because it's always valid) for
* context switching guest state via VMLOAD/VMSAVE, that way
@@ -142,6 +211,9 @@ SYM_FUNC_START(__svm_vcpu_run)
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -177,6 +249,9 @@ SYM_FUNC_START(__svm_vcpu_run)
xor %r15d, %r15d
#endif
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -191,6 +266,9 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %_ASM_BP
RET
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
10: cmpb $0, kvm_rebooting
jne 2b
ud2
@@ -214,6 +292,7 @@ SYM_FUNC_END(__svm_vcpu_run)
/**
* __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_sev_es_vcpu_run)
push %_ASM_BP
@@ -228,8 +307,30 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
push %_ASM_BX
+ /*
+ * Save variables needed after vmexit on the stack, in inverse
+ * order compared to when they are needed.
+ */
+
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
/* Get svm->current_vmcb->pa into RAX. */
- mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Enter guest mode */
@@ -239,11 +340,17 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
2: cli
+ /* Pop @svm to RDI, guest registers have been saved already. */
+ pop %_ASM_DI
+
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -253,6 +360,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
*/
UNTRAIN_RET
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -267,6 +377,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
pop %_ASM_BP
RET
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
3: cmpb $0, kvm_rebooting
jne 2b
ud2
--
2.31.1
FILL_RETURN_BUFFER can access percpu data, therefore vmload of the
host save area must be executed first. First of all, move the VMCB
vmsave/vmload to assembly, essentially undoing commit fb0c4a4fee5a ("KVM:
SVM: move VMLOAD/VMSAVE to C code", 2021-03-15). The reason for that
commit was that it made it simpler to use a different VMCB for
VMLOAD/VMSAVE versus VMRUN; but that is not a big hassle anymore thanks
to the kvm-asm-offsets machinery.
The idea on how to number the exception tables is stolen from
a prototype patch by Peter Zijlstra.
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Link: <https://lore.kernel.org/all/f571e404-e625-bae1-10e9-449b2eb4cbd8@citrix.com/>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
arch/x86/kvm/kvm-asm-offsets.c | 1 +
arch/x86/kvm/svm/svm.c | 9 -------
arch/x86/kvm/svm/vmenter.S | 49 ++++++++++++++++++++++++++--------
3 files changed, 39 insertions(+), 20 deletions(-)
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
index f1b694e431ae..f83e88b85bf2 100644
--- a/arch/x86/kvm/kvm-asm-offsets.c
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -16,6 +16,7 @@ static void __used common(void)
BLANK();
OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 48274c93d78b..4e3a47eb5002 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3910,16 +3910,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
} else {
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
- /*
- * Use a single vmcb (vmcb01 because it's always valid) for
- * context switching guest state via VMLOAD/VMSAVE, that way
- * the state doesn't need to be copied between vmcb01 and
- * vmcb02 when switching vmcbs for nested virtualization.
- */
- vmload(svm->vmcb01.pa);
__svm_vcpu_run(svm);
- vmsave(svm->vmcb01.pa);
-
vmload(__sme_page_pa(sd->save_area));
}
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index d07bac1952c5..5bc2ed7d79c0 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -28,6 +28,8 @@
#define VCPU_R15 (SVM_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE)
#endif
+#define SVM_vmcb01_pa (SVM_vmcb01 + KVM_VMCB_pa)
+
.section .noinstr.text, "ax"
/**
@@ -55,6 +57,16 @@ SYM_FUNC_START(__svm_vcpu_run)
mov %_ASM_ARG1, %_ASM_DI
.endif
+ /*
+ * Use a single vmcb (vmcb01 because it's always valid) for
+ * context switching guest state via VMLOAD/VMSAVE, that way
+ * the state doesn't need to be copied between vmcb01 and
+ * vmcb02 when switching vmcbs for nested virtualization.
+ */
+ mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
+1: vmload %_ASM_AX
+2:
+
/* Get svm->current_vmcb->pa into RAX. */
mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
@@ -80,16 +92,11 @@ SYM_FUNC_START(__svm_vcpu_run)
/* Enter guest mode */
sti
-1: vmrun %_ASM_AX
-
-2: cli
-
-#ifdef CONFIG_RETPOLINE
- /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
- FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
-#endif
+3: vmrun %_ASM_AX
+4:
+ cli
- /* "POP" @svm to RAX. */
+ /* Pop @svm to RAX while it's the only available register. */
pop %_ASM_AX
/* Save all guest registers. */
@@ -110,6 +117,18 @@ SYM_FUNC_START(__svm_vcpu_run)
mov %r15, VCPU_R15(%_ASM_AX)
#endif
+ /* @svm can stay in RDI from now on. */
+ mov %_ASM_AX, %_ASM_DI
+
+ mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
+5: vmsave %_ASM_AX
+6:
+
+#ifdef CONFIG_RETPOLINE
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+#endif
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -159,11 +178,19 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %_ASM_BP
RET
-3: cmpb $0, kvm_rebooting
+10: cmpb $0, kvm_rebooting
jne 2b
ud2
+30: cmpb $0, kvm_rebooting
+ jne 4b
+ ud2
+50: cmpb $0, kvm_rebooting
+ jne 6b
+ ud2
- _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE(1b, 10b)
+ _ASM_EXTABLE(3b, 30b)
+ _ASM_EXTABLE(5b, 50b)
SYM_FUNC_END(__svm_vcpu_run)
--
2.31.1
This is a note to let you know that I've just added the patch titled
binder: validate alloc->mm in ->mmap() handler
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 3ce00bb7e91cf57d723905371507af57182c37ef Mon Sep 17 00:00:00 2001
From: Carlos Llamas <cmllamas(a)google.com>
Date: Fri, 4 Nov 2022 23:12:35 +0000
Subject: binder: validate alloc->mm in ->mmap() handler
Since commit 1da52815d5f1 ("binder: fix alloc->vma_vm_mm null-ptr
dereference") binder caches a pointer to the current->mm during open().
This fixes a null-ptr dereference reported by syzkaller. Unfortunately,
it also opens the door for a process to update its mm after the open(),
(e.g. via execve) making the cached alloc->mm pointer invalid.
Things get worse when the process continues to mmap() a vma. From this
point forward, binder will attempt to find this vma using an obsolete
alloc->mm reference. Such as in binder_update_page_range(), where the
wrong vma is obtained via vma_lookup(), yet binder proceeds to happily
insert new pages into it.
To avoid this issue fail the ->mmap() callback if we detect a mismatch
between the vma->vm_mm and the original alloc->mm pointer. This prevents
alloc->vm_addr from getting set, so that any subsequent vma_lookup()
calls fail as expected.
Fixes: 1da52815d5f1 ("binder: fix alloc->vma_vm_mm null-ptr dereference")
Reported-by: Jann Horn <jannh(a)google.com>
Cc: <stable(a)vger.kernel.org> # 5.15+
Signed-off-by: Carlos Llamas <cmllamas(a)google.com>
Acked-by: Todd Kjos <tkjos(a)google.com>
Link: https://lore.kernel.org/r/20221104231235.348958-1-cmllamas@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/android/binder_alloc.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 1c39cfce32fa..4ad42b0f75cd 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -739,6 +739,12 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc,
const char *failure_string;
struct binder_buffer *buffer;
+ if (unlikely(vma->vm_mm != alloc->mm)) {
+ ret = -EINVAL;
+ failure_string = "invalid vma->vm_mm";
+ goto err_invalid_mm;
+ }
+
mutex_lock(&binder_alloc_mmap_lock);
if (alloc->buffer_size) {
ret = -EBUSY;
@@ -785,6 +791,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc,
alloc->buffer_size = 0;
err_already_mapped:
mutex_unlock(&binder_alloc_mmap_lock);
+err_invalid_mm:
binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
"%s: %d %lx-%lx %s failed %d\n", __func__,
alloc->pid, vma->vm_start, vma->vm_end,
--
2.38.1
On Wed, Nov 09, 2022 at 05:34:13PM +0300, Denis Arefev wrote:
> Date: Wed, 9 Nov 2022 16:52:17 +0300
> Subject: [PATCH 5.10] nbio_v7_4: Add pointer check
>
> Return value of a function 'amdgpu_ras_find_obj' is dereferenced at nbio_v7_4.c:325 without checking for null
>
> Found by Linux Verification Center (linuxtesting.org) with SVACE.
>
> Signed-off-by: Denis Arefev <arefev(a)swemel.ru>
> ---
> drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> index eadc9526d33f..d2627a610e48 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> @@ -303,6 +303,9 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
> struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
> struct ras_err_data err_data = {0, 0, 0, NULL};
> struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>
> + if (!obj)
> + return;
>
> bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL);
> if (REG_GET_FIELD(bif_doorbell_intr_cntl,
> --
> 2.25.1
>
<formletter>
This is not the correct way to submit patches for inclusion in the
stable kernel tree. Please read:
https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
for how to do this properly.
</formletter>
The following commit has been merged into the x86/fpu branch of tip:
Commit-ID: 48280042f2c6e3ac2cfb1d8b752ab4a7e0baea24
Gitweb: https://git.kernel.org/tip/48280042f2c6e3ac2cfb1d8b752ab4a7e0baea24
Author: Andrew Cooper <andrew.cooper3(a)citrix.com>
AuthorDate: Wed, 10 Aug 2022 23:19:09 +01:00
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Wed, 09 Nov 2022 13:28:31 +01:00
x86/fpu/xstate: Fix XSTATE_WARN_ON() to emit relevant diagnostics
"XSAVE consistency problem" has been reported under Xen, but that's the extent
of my divination skills.
Modify XSTATE_WARN_ON() to force the caller to provide relevant diagnostic
information, and modify each caller suitably.
For check_xstate_against_struct(), this removes a double WARN() where one will
do perfectly fine.
CC stable as this has been wonky debugging for 7 years and it is good to
have there too.
Signed-off-by: Andrew Cooper <andrew.cooper3(a)citrix.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20220810221909.12768-1-andrew.cooper3@citrix.com
---
arch/x86/kernel/fpu/xstate.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 59e543b..c2dde46 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -440,8 +440,8 @@ static void __init __xstate_dump_leaves(void)
}
}
-#define XSTATE_WARN_ON(x) do { \
- if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \
+#define XSTATE_WARN_ON(x, fmt, ...) do { \
+ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \
__xstate_dump_leaves(); \
} \
} while (0)
@@ -554,8 +554,7 @@ static bool __init check_xstate_against_struct(int nr)
(nr >= XFEATURE_MAX) ||
(nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
- WARN_ONCE(1, "no structure for xstate: %d\n", nr);
- XSTATE_WARN_ON(1);
+ XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
return false;
}
return true;
@@ -598,12 +597,13 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
* XSAVES.
*/
if (!xsaves && xfeature_is_supervisor(i)) {
- XSTATE_WARN_ON(1);
+ XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
return false;
}
}
size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
- XSTATE_WARN_ON(size != kernel_size);
+ XSTATE_WARN_ON(size != kernel_size,
+ "size %u != kernel_size %u\n", size, kernel_size);
return size == kernel_size;
}
Before adding this quirk, this (mechanical keyboard) device would not be
recognized, logging:
new full-speed USB device number 56 using xhci_hcd
unable to read config index 0 descriptor/start: -32
chopping to 0 config(s)
It would take dozens of plugging/unpuggling cycles for the keyboard to
be recognized. Keyboard seems to simply work after applying this quirk.
This issue had been reported by users in two places already ([1], [2])
but nobody tried upstreaming a patch yet. After testing I believe their
suggested fix (DELAY_INIT + NO_LPM + DEVICE_QUALIFIER) was probably a
little overkill. I assume this particular combination was tested because
it had been previously suggested in [3], but only NO_LPM seems
sufficient for this device.
[1]: https://qiita.com/float168/items/fed43d540c8e2201b543
[2]: https://blog.kostic.dev/posts/making-the-realforce-87ub-work-with-usb30-on-…
[3]: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678477
---
Changes in v2:
- add the entry to the right location (sorting entries by
vendor/device id).
Cc: stable(a)vger.kernel.org
Signed-off-by: Nicolas Dumazet <ndumazet(a)google.com>
---
drivers/usb/core/quirks.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index 0722d2131305..079e183cf3bf 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -362,6 +362,9 @@ static const struct usb_device_id usb_quirk_list[] = {
{ USB_DEVICE(0x0781, 0x5583), .driver_info = USB_QUIRK_NO_LPM },
{ USB_DEVICE(0x0781, 0x5591), .driver_info = USB_QUIRK_NO_LPM },
+ /* Realforce 87U Keyboard */
+ { USB_DEVICE(0x0853, 0x011b), .driver_info = USB_QUIRK_NO_LPM },
+
/* M-Systems Flash Disk Pioneers */
{ USB_DEVICE(0x08ec, 0x1000), .driver_info = USB_QUIRK_RESET_RESUME },
--
2.38.0.135.g90850a2211-goog
Before adding this quirk, this (mechanical keyboard) device would not be
recognized, logging:
new full-speed USB device number 56 using xhci_hcd
unable to read config index 0 descriptor/start: -32
chopping to 0 config(s)
It would take dozens of plugging/unpuggling cycles for the keyboard to
be recognized. Keyboard seems to simply work after applying this quirk.
This issue had been reported by users in two places already ([1], [2])
but nobody tried upstreaming a patch yet. After testing I believe their
suggested fix (DELAY_INIT + NO_LPM + DEVICE_QUALIFIER) was probably a
little overkill. I assume this particular combination was tested because
it had been previously suggested in [3], but only NO_LPM seems
sufficient for this device.
[1]: https://qiita.com/float168/items/fed43d540c8e2201b543
[2]: https://blog.kostic.dev/posts/making-the-realforce-87ub-work-with-usb30-on-…
[3]: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678477
Cc: stable(a)vger.kernel.org
Signed-off-by: Nicolas Dumazet <ndumazet(a)google.com>
---
Changes:
* v2 -> v3: fix s-o-b / patch format.
* v1 -> v2 : add the entry to the right location (sorting entries by
vendor/device id).
---
drivers/usb/core/quirks.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index 0722d2131305..079e183cf3bf 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -362,6 +362,9 @@ static const struct usb_device_id usb_quirk_list[] = {
{ USB_DEVICE(0x0781, 0x5583), .driver_info = USB_QUIRK_NO_LPM },
{ USB_DEVICE(0x0781, 0x5591), .driver_info = USB_QUIRK_NO_LPM },
+ /* Realforce 87U Keyboard */
+ { USB_DEVICE(0x0853, 0x011b), .driver_info = USB_QUIRK_NO_LPM },
+
/* M-Systems Flash Disk Pioneers */
{ USB_DEVICE(0x08ec, 0x1000), .driver_info = USB_QUIRK_RESET_RESUME },
--
2.38.0.135.g90850a2211-goog
This is a note to let you know that I've just added the patch titled
serial: 8250: Flush DMA Rx on RLSI
to my tty git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git
in the tty-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 1980860e0c8299316cddaf0992dd9e1258ec9d88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen(a)linux.intel.com>
Date: Tue, 8 Nov 2022 14:19:52 +0200
Subject: serial: 8250: Flush DMA Rx on RLSI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Returning true from handle_rx_dma() without flushing DMA first creates
a data ordering hazard. If DMA Rx has handled any character at the
point when RLSI occurs, the non-DMA path handles any pending characters
jumping them ahead of those characters that are pending under DMA.
Fixes: 75df022b5f89 ("serial: 8250_dma: Fix RX handling")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20221108121952.5497-5-ilpo.jarvinen@linux.intel.c…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/tty/serial/8250/8250_port.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 92dd18716169..388172289627 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1901,10 +1901,9 @@ static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir)
if (!up->dma->rx_running)
break;
fallthrough;
+ case UART_IIR_RLSI:
case UART_IIR_RX_TIMEOUT:
serial8250_rx_dma_flush(up);
- fallthrough;
- case UART_IIR_RLSI:
return true;
}
return up->dma->rx_dma(up);
--
2.38.1
This is a note to let you know that I've just added the patch titled
serial: 8250_lpss: Use 16B DMA burst with Elkhart Lake
to my tty git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git
in the tty-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 7090abd6ad0610a144523ce4ffcb8560909bf2a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen(a)linux.intel.com>
Date: Tue, 8 Nov 2022 14:19:51 +0200
Subject: serial: 8250_lpss: Use 16B DMA burst with Elkhart Lake
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Configure DMA to use 16B burst size with Elkhart Lake. This makes the
bus use more efficient and works around an issue which occurs with the
previously used 1B.
The fix was initially developed by Srikanth Thokala and Aman Kumar.
This together with the previous config change is the cleaned up version
of the original fix.
Fixes: 0a9410b981e9 ("serial: 8250_lpss: Enable DMA on Intel Elkhart Lake")
Cc: <stable(a)vger.kernel.org> # serial: 8250_lpss: Configure DMA also w/o DMA filter
Reported-by: Wentong Wu <wentong.wu(a)intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20221108121952.5497-4-ilpo.jarvinen@linux.intel.c…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/tty/serial/8250/8250_lpss.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/tty/serial/8250/8250_lpss.c b/drivers/tty/serial/8250/8250_lpss.c
index 7d9cddbfef40..0e43bdfb7459 100644
--- a/drivers/tty/serial/8250/8250_lpss.c
+++ b/drivers/tty/serial/8250/8250_lpss.c
@@ -174,6 +174,8 @@ static int ehl_serial_setup(struct lpss8250 *lpss, struct uart_port *port)
*/
up->dma = dma;
+ lpss->dma_maxburst = 16;
+
port->set_termios = dw8250_do_set_termios;
return 0;
--
2.38.1
This is a note to let you know that I've just added the patch titled
serial: 8250: Fall back to non-DMA Rx if IIR_RDI occurs
to my tty git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git
in the tty-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From a931237cbea256aff13bb403da13a97b2d1605d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen(a)linux.intel.com>
Date: Tue, 8 Nov 2022 14:19:49 +0200
Subject: serial: 8250: Fall back to non-DMA Rx if IIR_RDI occurs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
DW UART sometimes triggers IIR_RDI during DMA Rx when IIR_RX_TIMEOUT
should have been triggered instead. Since IIR_RDI has higher priority
than IIR_RX_TIMEOUT, this causes the Rx to hang into interrupt loop.
The problem seems to occur at least with some combinations of
small-sized transfers (I've reproduced the problem on Elkhart Lake PSE
UARTs).
If there's already an on-going Rx DMA and IIR_RDI triggers, fall
graciously back to non-DMA Rx. That is, behave as if IIR_RX_TIMEOUT had
occurred.
8250_omap already considers IIR_RDI similar to this change so its
nothing unheard of.
Fixes: 75df022b5f89 ("serial: 8250_dma: Fix RX handling")
Cc: <stable(a)vger.kernel.org>
Co-developed-by: Srikanth Thokala <srikanth.thokala(a)intel.com>
Signed-off-by: Srikanth Thokala <srikanth.thokala(a)intel.com>
Co-developed-by: Aman Kumar <aman.kumar(a)intel.com>
Signed-off-by: Aman Kumar <aman.kumar(a)intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20221108121952.5497-2-ilpo.jarvinen@linux.intel.c…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/tty/serial/8250/8250_port.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index fe8662cd9402..92dd18716169 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1897,6 +1897,10 @@ EXPORT_SYMBOL_GPL(serial8250_modem_status);
static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir)
{
switch (iir & 0x3f) {
+ case UART_IIR_RDI:
+ if (!up->dma->rx_running)
+ break;
+ fallthrough;
case UART_IIR_RX_TIMEOUT:
serial8250_rx_dma_flush(up);
fallthrough;
--
2.38.1
We've got a dump that current cpu is in pinctrl_commit_state, the
old_state != p->state while the stack is still in the process of
pinmux_disable_setting. So it means even if the current p->state is
changed in new state, the settings are not yet up-to-date enabled
complete yet.
Currently p->state in different value to synchronize the
pinctrl_commit_state behaviors. The p->state will have transaction like
old_state -> NULL -> new_state. When in old_state, it will try to
disable all the all state settings. And when after new state settings
enabled, p->state will changed to the new state after that. So use
smp_mb to synchronize the p->state variable and the settings in order.
---
drivers/pinctrl/core.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 9e57f4c62e60..cd917a5b1a0a 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -1256,6 +1256,7 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state)
}
}
+ smp_mb();
p->state = NULL;
/* Apply all the settings for the new state - pinmux first */
@@ -1305,6 +1306,7 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state)
pinctrl_link_add(setting->pctldev, p->dev);
}
+ smp_mb();
p->state = state;
return 0;
--
2.17.1
Looks like that we're accidentally dropping a pretty important return code
here. For some reason, we just return -EINVAL if we fail to get the MST
topology state. This is wrong: error codes are important and should never
be squashed without being handled, which here seems to have the potential
to cause a deadlock.
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
Fixes: 8ec046716ca8 ("drm/dp_mst: Add helper to trigger modeset on affected DSC MST CRTCs")
Cc: <stable(a)vger.kernel.org> # v5.6+
---
drivers/gpu/drm/display/drm_dp_mst_topology.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c
index ecd22c038c8c0..51a46689cda70 100644
--- a/drivers/gpu/drm/display/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c
@@ -5186,7 +5186,7 @@ int drm_dp_mst_add_affected_dsc_crtcs(struct drm_atomic_state *state, struct drm
mst_state = drm_atomic_get_mst_topology_state(state, mgr);
if (IS_ERR(mst_state))
- return -EINVAL;
+ return PTR_ERR(mst_state);
list_for_each_entry(pos, &mst_state->payloads, next) {
--
2.37.3
Restoration of the host IA32_SPEC_CTRL value is probably too late
with respect to the return thunk training sequence.
With respect to the user/kernel boundary, AMD says, "If software chooses
to toggle STIBP (e.g., set STIBP on kernel entry, and clear it on kernel
exit), software should set STIBP to 1 before executing the return thunk
training sequence." I assume the same requirements apply to the guest/host
boundary. The return thunk training sequence is in vmenter.S, quite close
to the VM-exit. On hosts without V_SPEC_CTRL, however, the host's
IA32_SPEC_CTRL value is not restored until much later.
To avoid this, move the restoration of host SPEC_CTRL to assembly and,
for consistency, move the restoration of the guest SPEC_CTRL as well.
This is not particularly difficult, apart from some care to cover both
32- and 64-bit, and to share code between SEV-ES and normal vmentry.
Cc: stable(a)vger.kernel.org
Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk")
Suggested-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
arch/x86/kernel/cpu/bugs.c | 13 +----
arch/x86/kvm/kvm-asm-offsets.c | 1 +
arch/x86/kvm/svm/svm.c | 38 +++++-------
arch/x86/kvm/svm/svm.h | 4 +-
arch/x86/kvm/svm/vmenter.S | 102 ++++++++++++++++++++++++++++++++-
5 files changed, 121 insertions(+), 37 deletions(-)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index da7c361f47e0..6ec0b7ce7453 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -196,22 +196,15 @@ void __init check_bugs(void)
}
/*
- * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is
- * done in vmenter.S.
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
*/
void
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
{
- u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
+ u64 guestval, hostval;
struct thread_info *ti = current_thread_info();
- if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
- if (hostval != guestval) {
- msrval = setguest ? guestval : hostval;
- wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
- }
- }
-
/*
* If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
* MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
index f83e88b85bf2..b2877c2c8df1 100644
--- a/arch/x86/kvm/kvm-asm-offsets.c
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -16,6 +16,7 @@ static void __used common(void)
BLANK();
OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e15f6ea9e5cc..512bc06a4ba1 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -730,6 +730,15 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
u32 offset;
u32 *msrpm;
+ /*
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
to_svm(vcpu)->msrpm;
@@ -3911,18 +3920,19 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
struct vcpu_svm *svm = to_svm(vcpu);
guest_state_enter_irqoff();
if (sev_es_guest(vcpu->kvm)) {
- __svm_sev_es_vcpu_run(svm);
+ __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
} else {
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
- __svm_vcpu_run(svm, __sme_page_pa(sd->save_area));
+ __svm_vcpu_run(svm, __sme_page_pa(sd->save_area),
+ spec_ctrl_intercepted);
}
guest_state_exit_irqoff();
@@ -3931,6 +3941,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
trace_kvm_entry(vcpu);
@@ -3989,26 +4000,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- svm_vcpu_enter_exit(vcpu);
-
- /*
- * We do not use IBRS in the kernel. If this vCPU has used the
- * SPEC_CTRL MSR it may have left it on; save the value and
- * turn it off. This is much more efficient than blindly adding
- * it to the atomic save/restore list. Especially as the former
- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
- *
- * For non-nested case:
- * If the L01 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- *
- * For nested case:
- * If the L02 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- */
- if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
- unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
- svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+ svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
if (!sev_es_guest(vcpu->kvm))
reload_tss(vcpu);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 932f26be5675..bf9ff39dc420 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -683,7 +683,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
-void __svm_sev_es_vcpu_run(struct vcpu_svm *svm);
-void __svm_vcpu_run(struct vcpu_svm *svm, unsigned long hsave_pa);
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_vcpu_run(struct vcpu_svm *svm, unsigned long hsave_pa, bool spec_ctrl_intercepted);
#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 0a4272faf80f..a02eef724379 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -32,10 +32,70 @@
.section .noinstr.text, "ax"
+.macro RESTORE_GUEST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+801:
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+901:
+.endm
+
+.macro RESTORE_SPEC_CTRL_BODY
+800:
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR. This is kept out-of-line so that the common
+ * case does not have to jump.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+ movl SVM_spec_ctrl(%_ASM_DI), %eax
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ je 801b
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ xor %edx, %edx
+ wrmsr
+ jmp 801b
+
+900:
+ /* Same for after vmexit. */
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+
+ /*
+ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+ * if it was not intercepted during guest execution.
+ */
+ cmpb $0, (%_ASM_SP)
+ jnz 998f
+ rdmsr
+ movl %eax, SVM_spec_ctrl(%_ASM_DI)
+998:
+
+ /* Now restore the host value of the MSR if different from the guest's. */
+ movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ cmp SVM_spec_ctrl(%_ASM_DI), %eax
+ je 901b
+ xor %edx, %edx
+ wrmsr
+ jmp 901b
+.endm
+
+
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
* @hsave_pa: unsigned long
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BP
@@ -50,7 +110,12 @@ SYM_FUNC_START(__svm_vcpu_run)
#endif
push %_ASM_BX
- /* @hsave_pa is needed last after vmexit, save it first. */
+ /*
+ * Both @spec_ctrl_intercepted and @hsave_pa are used only after vmexit.
+ * @spec_ctrl_intercepted is needed later and accessed directly from
+ * the stack in RESTORE_HOST_SPEC_CTRL, so save it first.
+ */
+ push %_ASM_ARG3
push %_ASM_ARG2
/* Save @svm. */
@@ -61,6 +126,8 @@ SYM_FUNC_START(__svm_vcpu_run)
mov %_ASM_ARG1, %_ASM_DI
.endif
+ RESTORE_GUEST_SPEC_CTRL
+
/*
* Use a single vmcb (vmcb01 because it's always valid) for
* context switching guest state via VMLOAD/VMSAVE, that way
@@ -138,6 +205,8 @@ SYM_FUNC_START(__svm_vcpu_run)
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -173,6 +242,9 @@ SYM_FUNC_START(__svm_vcpu_run)
xor %r15d, %r15d
#endif
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -187,6 +259,8 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %_ASM_BP
RET
+ RESTORE_SPEC_CTRL_BODY
+
10: cmpb $0, kvm_rebooting
jne 2b
ud2
@@ -210,6 +284,7 @@ SYM_FUNC_END(__svm_vcpu_run)
/**
* __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_sev_es_vcpu_run)
push %_ASM_BP
@@ -224,8 +299,21 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
push %_ASM_BX
+ /* Save @spec_ctrl_intercepted for RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /* Move @svm to RDI for RESTORE_GUEST_SPEC_CTRL. */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ RESTORE_GUEST_SPEC_CTRL
+
/* Get svm->current_vmcb->pa into RAX. */
- mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Enter guest mode */
@@ -235,11 +323,16 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
2: cli
+ /* Pop @svm to RDI, guest registers have been saved already. */
+ pop %_ASM_DI
+
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -249,6 +342,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
*/
UNTRAIN_RET
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -263,6 +359,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
pop %_ASM_BP
RET
+ RESTORE_SPEC_CTRL_BODY
+
3: cmpb $0, kvm_rebooting
jne 2b
ud2
--
2.31.1
Buenos días:
Le escribo para hablarle sobre una de las mejores herramientas GPS en el mercado.
La herramienta, que me gustaría presentarle brevemente, dispone de muchas funciones útiles para su trabajo, que optimizan los procesos de transporte y le ayudan a realizar tareas de campo de manera más eficiente.
¿Quiere conocer los detalles?
Atentamente,
Antonio Valverde
From: Stylon Wang <stylon.wang(a)amd.com>
[Why]
Since introduction of patch "Query DPIA HPD status.", link detection at
boot could be accessing DPIA AUX, which will not succeed until
DMUB outbox messaging is enabled and results in below dmesg logs:
[ 160.840227] [drm:amdgpu_dm_process_dmub_aux_transfer_sync [amdgpu]] *ERROR* wait_for_completion_timeout timeout!
[How]
Enable DMUB outbox messaging before link detection at boot time.
Reviewed-by: Wayne Lin <Wayne.Lin(a)amd.com>
Acked-by: Tom Chung <chiahsuan.chung(a)amd.com>
Signed-off-by: Stylon Wang <stylon.wang(a)amd.com>
Cc: stable(a)vger.kernel.org #6.0.x
---
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 66eb16fbe09f..4d90c5415d5c 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -1633,12 +1633,6 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
}
}
- if (amdgpu_dm_initialize_drm_device(adev)) {
- DRM_ERROR(
- "amdgpu: failed to initialize sw for display support.\n");
- goto error;
- }
-
/* Enable outbox notification only after IRQ handlers are registered and DMUB is alive.
* It is expected that DMUB will resend any pending notifications at this point, for
* example HPD from DPIA.
@@ -1646,6 +1640,12 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
if (dc_is_dmub_outbox_supported(adev->dm.dc))
dc_enable_dmub_outbox(adev->dm.dc);
+ if (amdgpu_dm_initialize_drm_device(adev)) {
+ DRM_ERROR(
+ "amdgpu: failed to initialize sw for display support.\n");
+ goto error;
+ }
+
/* create fake encoders for MST */
dm_dp_create_fake_mst_encoders(adev);
--
2.25.1
From: Xiubo Li <xiubli(a)redhat.com>
When decoding the snaps fails it maybe leaving the 'first_realm'
and 'realm' pointing to the same snaprealm memory. And then it'll
put it twice and could cause random use-after-free, BUG_ON, etc
issues.
Cc: stable(a)vger.kernel.org
URL: https://tracker.ceph.com/issues/57686
Signed-off-by: Xiubo Li <xiubli(a)redhat.com>
---
fs/ceph/snap.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 9bceed2ebda3..f5b0fa1ff705 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -775,6 +775,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
dout("%s deletion=%d\n", __func__, deletion);
more:
+ realm = NULL;
rebuild_snapcs = 0;
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
--
2.31.1