From: Stefan Schake stschake@gmail.com
[ Upstream commit 253696ccd613fbdaa5aba1de44c461a058e0a114 ]
Synchronously disable the IRQ to make the following cancel_work_sync invocation effective.
An interrupt in flight could enqueue further overflow mem work. As we free the binner BO immediately following vc4_irq_uninstall this caused a NULL pointer dereference in the work callback vc4_overflow_mem_work.
Link: https://github.com/anholt/linux/issues/114 Signed-off-by: Stefan Schake stschake@gmail.com Fixes: d5b1a78a772f ("drm/vc4: Add support for drawing 3D frames.") Signed-off-by: Eric Anholt eric@anholt.net Reviewed-by: Eric Anholt eric@anholt.net Link: https://patchwork.freedesktop.org/patch/msgid/1510275907-993-2-git-send-emai... Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/vc4/vc4_irq.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c index 7d7af3a93d94..61b2e5377993 100644 --- a/drivers/gpu/drm/vc4/vc4_irq.c +++ b/drivers/gpu/drm/vc4/vc4_irq.c @@ -208,6 +208,9 @@ vc4_irq_postinstall(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev);
+ /* Undo the effects of a previous vc4_irq_uninstall. */ + enable_irq(dev->irq); + /* Enable both the render done and out of memory interrupts. */ V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
@@ -225,6 +228,9 @@ vc4_irq_uninstall(struct drm_device *dev) /* Clear any pending interrupts we might have left. */ V3D_WRITE(V3D_INTCTL, V3D_DRIVER_IRQS);
+ /* Finish any interrupt handler still in flight. */ + disable_irq(dev->irq); + cancel_work_sync(&vc4->overflow_mem_work); }
From: Nikolay Borisov nborisov@suse.com
[ Upstream commit 5e9f2ad5b2904a7e81df6d9a3dbef29478952eac ]
btrfs_rm_dev_item calls several function under an active transaction, however it fails to abort it if an error happens. Fix this by adding explicit btrfs_abort_transaction/btrfs_end_transaction calls.
Signed-off-by: Nikolay Borisov nborisov@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/btrfs/volumes.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0c11121a8ace..4006b2a1233d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1765,20 +1765,24 @@ static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, key.offset = device->devid;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = -ENOENT; + if (ret) { + if (ret > 0) + ret = -ENOENT; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); goto out; }
ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + out: btrfs_free_path(path); - btrfs_commit_transaction(trans); + if (!ret) + ret = btrfs_commit_transaction(trans); return ret; }
From: Liu Bo bo.li.liu@oracle.com
[ Upstream commit 56a0e706fcf870270878d6d72b71092ae42d229c ]
If a file's DIR_ITEM key is invalid (due to memory errors) and gets written to disk, a future lookup_path can end up with kernel panic due to BUG_ON().
This gets rid of the BUG_ON(), meanwhile output the corrupted key and return ENOENT if it's invalid.
Signed-off-by: Liu Bo bo.li.liu@oracle.com Reported-by: Guillaume Bouchard bouchard@mercs-eng.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/btrfs/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d94e3f68b9b1..c71afd424900 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5500,6 +5500,14 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, goto out_err;
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); + if (location->type != BTRFS_INODE_ITEM_KEY && + location->type != BTRFS_ROOT_ITEM_KEY) { + btrfs_warn(root->fs_info, +"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", + __func__, name, btrfs_ino(BTRFS_I(dir)), + location->objectid, location->type, location->offset); + goto out_err; + } out: btrfs_free_path(path); return ret; @@ -5816,8 +5824,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; }
- BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); - index = srcu_read_lock(&fs_info->subvol_srcu); ret = fixup_tree_root_location(fs_info, dir, dentry, &location, &sub_root);
From: Abhishek Goel huntbag@linux.vnet.ibm.com
[ Upstream commit 53d1cd6b125fb9d69303516a1179ebc3b72f797a ]
cpupower_is_cpu_online was incorrectly checking for 0. This patch fixes this by checking for 1 when the cpu is online.
Signed-off-by: Abhishek Goel huntbag@linux.vnet.ibm.com Signed-off-by: Shuah Khan shuahkh@osg.samsung.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- tools/power/cpupower/bench/system.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/power/cpupower/bench/system.c b/tools/power/cpupower/bench/system.c index c25a74ae51ba..2bb3eef7d5c1 100644 --- a/tools/power/cpupower/bench/system.c +++ b/tools/power/cpupower/bench/system.c @@ -61,7 +61,7 @@ int set_cpufreq_governor(char *governor, unsigned int cpu)
dprintf("set %s as cpufreq governor\n", governor);
- if (cpupower_is_cpu_online(cpu) != 0) { + if (cpupower_is_cpu_online(cpu) != 1) { perror("cpufreq_cpu_exists"); fprintf(stderr, "error: cpu %u does not exist\n", cpu); return -1;
From: Abhishek Goel huntbag@linux.vnet.ibm.com
[ Upstream commit dbdc468f35ee827cab2753caa1c660bdb832243a ]
cpuidle_monitor used to assume that cpu0 is always online which is not a valid assumption on POWER machines. This patch fixes this by getting the cpu on which the current thread is running, instead of always using cpu0 for monitoring which may not be online.
Signed-off-by: Abhishek Goel huntbag@linux.vnet.ibm.com Signed-off-by: Shuah Khan shuahkh@osg.samsung.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c index 1b5da0066ebf..5b3205f16217 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c +++ b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c @@ -130,15 +130,18 @@ static struct cpuidle_monitor *cpuidle_register(void) { int num; char *tmp; + int this_cpu; + + this_cpu = sched_getcpu();
/* Assume idle state count is the same for all CPUs */ - cpuidle_sysfs_monitor.hw_states_num = cpuidle_state_count(0); + cpuidle_sysfs_monitor.hw_states_num = cpuidle_state_count(this_cpu);
if (cpuidle_sysfs_monitor.hw_states_num <= 0) return NULL;
for (num = 0; num < cpuidle_sysfs_monitor.hw_states_num; num++) { - tmp = cpuidle_state_name(0, num); + tmp = cpuidle_state_name(this_cpu, num); if (tmp == NULL) continue;
@@ -146,7 +149,7 @@ static struct cpuidle_monitor *cpuidle_register(void) strncpy(cpuidle_cstates[num].name, tmp, CSTATE_NAME_LEN - 1); free(tmp);
- tmp = cpuidle_state_desc(0, num); + tmp = cpuidle_state_desc(this_cpu, num); if (tmp == NULL) continue; strncpy(cpuidle_cstates[num].desc, tmp, CSTATE_DESC_LEN - 1);
From: Liran Alon liran.alon@oracle.com
[ Upstream commit ac9b305caa0df6f5b75d294e4b86c1027648991e ]
When running L2, #UD should be intercepted by L1 or just forwarded directly to L2. It should not reach L0 x86 emulator. Therefore, set intercept for #UD only based on L1 exception-bitmap.
Also add WARN_ON_ONCE() on L0 #UD intercept handlers to make sure it is never reached while running L2.
This improves commit ae1f57670703 ("KVM: nVMX: Do not emulate #UD while in guest mode") by removing an unnecessary exit from L2 to L0 on #UD when L1 doesn't intercept it.
In addition, SVM L0 #UD intercept handler doesn't handle correctly the case it is raised from L2. In this case, it should forward the #UD to guest instead of x86 emulator. As done in VMX #UD intercept handler. This commit fixes this issue as-well.
Signed-off-by: Liran Alon liran.alon@oracle.com Reviewed-by: Nikita Leshenko nikita.leshchenko@oracle.com Reviewed-by: Konrad Rzeszutek Wilk konrad.wilk@oracle.com Signed-off-by: Konrad Rzeszutek Wilk konrad.wilk@oracle.com Reviewed-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: Wanpeng Li wanpeng.li@hotmail.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/svm.c | 9 ++++++++- arch/x86/kvm/vmx.c | 9 ++++----- 2 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6a8284f72328..c8be4e6d365b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -362,6 +362,7 @@ static void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h; struct nested_state *g; + u32 h_intercept_exceptions;
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
@@ -372,9 +373,14 @@ static void recalc_intercepts(struct vcpu_svm *svm) h = &svm->nested.hsave->control; g = &svm->nested;
+ /* No need to intercept #UD if L1 doesn't intercept it */ + h_intercept_exceptions = + h->intercept_exceptions & ~(1U << UD_VECTOR); + c->intercept_cr = h->intercept_cr | g->intercept_cr; c->intercept_dr = h->intercept_dr | g->intercept_dr; - c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; + c->intercept_exceptions = + h_intercept_exceptions | g->intercept_exceptions; c->intercept = h->intercept | g->intercept; }
@@ -2189,6 +2195,7 @@ static int ud_interception(struct vcpu_svm *svm) { int er;
+ WARN_ON_ONCE(is_guest_mode(&svm->vcpu)); er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); if (er == EMULATE_USER_EXIT) return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ef16cf0f7cfd..36628ed362d8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1891,7 +1891,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb;
- eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | + eb = (1u << PF_VECTOR) | (1u << MC_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == @@ -1909,6 +1909,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) */ if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; + else + eb |= 1u << UD_VECTOR;
vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -5919,10 +5921,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) return 1; /* already handled by vmx_vcpu_run() */
if (is_invalid_opcode(intr_info)) { - if (is_guest_mode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + WARN_ON_ONCE(is_guest_mode(vcpu)); er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); if (er == EMULATE_USER_EXIT) return 0;
From: Liran Alon liran.alon@oracle.com
[ Upstream commit 1f4dcb3b213235e642088709a1c54964d23365e9 ]
On this case, handle_emulation_failure() fills kvm_run with internal-error information which it expects to be delivered to user-mode for further processing. However, the code reports a wrong return-value which makes KVM to never return to user-mode on this scenario.
Fixes: 6d77dbfc88e3 ("KVM: inject #UD if instruction emulation fails and exit to userspace")
Signed-off-by: Liran Alon liran.alon@oracle.com Reviewed-by: Nikita Leshenko nikita.leshchenko@oracle.com Reviewed-by: Konrad Rzeszutek Wilk konrad.wilk@oracle.com Signed-off-by: Konrad Rzeszutek Wilk konrad.wilk@oracle.com Reviewed-by: Wanpeng Li wanpeng.li@hotmail.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 575c8953cc9a..b25326ad82a1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5416,7 +5416,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - r = EMULATE_FAIL; + r = EMULATE_USER_EXIT; } kvm_queue_exception(vcpu, UD_VECTOR);
From: Wanpeng Li wanpeng.li@hotmail.com
[ Upstream commit 3853be2603191829b442b64dac6ae8ba0c027bf9 ]
Pedro reported: During tests that we conducted on KVM, we noticed that executing a "PUSH %ES" instruction under KVM produces different results on both memory and the SP register depending on whether EPT support is enabled. With EPT the SP is reduced by 4 bytes (and the written value is 0-padded) but without EPT support it is only reduced by 2 bytes. The difference can be observed when the CS.DB field is 1 (32-bit) but not when it's 0 (16-bit).
The internal segment descriptor cache exist even in real/vm8096 mode. The CS.D also should be respected instead of just default operand/address-size/66H prefix/67H prefix during instruction decoding. This patch fixes it by also adjusting operand/address-size according to CS.D.
Reported-by: Pedro Fonseca pfonseca@cs.washington.edu Tested-by: Pedro Fonseca pfonseca@cs.washington.edu Cc: Paolo Bonzini pbonzini@redhat.com Cc: Radim Krčmář rkrcmar@redhat.com Cc: Nadav Amit nadav.amit@gmail.com Cc: Pedro Fonseca pfonseca@cs.washington.edu Signed-off-by: Wanpeng Li wanpeng.li@hotmail.com Reviewed-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/emulate.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7bbb5da2b49d..1a2543545e5d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5009,6 +5009,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) bool op_prefix = false; bool has_seg_override = false; struct opcode opcode; + u16 dummy; + struct desc_struct desc;
ctxt->memop.type = OP_NONE; ctxt->memopp = NULL; @@ -5027,6 +5029,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) switch (mode) { case X86EMUL_MODE_REAL: case X86EMUL_MODE_VM86: + def_op_bytes = def_ad_bytes = 2; + ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS); + if (desc.d) + def_op_bytes = def_ad_bytes = 4; + break; case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break;
From: Liran Alon liran.alon@oracle.com
[ Upstream commit 9b8ae63798cb97e785a667ff27e43fa6220cb734 ]
In case of instruction-decode failure or emulation failure, x86_emulate_instruction() will call reexecute_instruction() which will attempt to use the cr2 value passed to x86_emulate_instruction(). However, when x86_emulate_instruction() is called from emulate_instruction(), cr2 is not passed (passed as 0) and therefore it doesn't make sense to execute reexecute_instruction() logic at all.
Fixes: 51d8b66199e9 ("KVM: cleanup emulate_instruction")
Signed-off-by: Liran Alon liran.alon@oracle.com Reviewed-by: Nikita Leshenko nikita.leshchenko@oracle.com Reviewed-by: Konrad Rzeszutek Wilk konrad.wilk@oracle.com Signed-off-by: Konrad Rzeszutek Wilk konrad.wilk@oracle.com Reviewed-by: Wanpeng Li wanpeng.li@hotmail.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/vmx.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index eb38ac9d9a31..4f8b80199672 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1156,7 +1156,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, static inline int emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { - return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); + return x86_emulate_instruction(vcpu, 0, + emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0); }
void kvm_enable_efer_bits(u64); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 36628ed362d8..079cedb910bc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6607,7 +6607,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1;
- err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); + err = emulate_instruction(vcpu, 0);
if (err == EMULATE_USER_EXIT) { ++vcpu->stat.mmio_exits;
From: Wanpeng Li wanpeng.li@hotmail.com
[ Upstream commit 5af4157388adad82c339e3742fb6b67840721347 ]
Commit 4f350c6dbcb (kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly) can result in L1(run kvm-unit-tests/run_tests.sh vmx_controls in L1) null pointer deference and also L0 calltrace when EPT=0 on both L0 and L1.
In L1:
BUG: unable to handle kernel paging request at ffffffffc015bf8f IP: vmx_vcpu_run+0x202/0x510 [kvm_intel] PGD 146e13067 P4D 146e13067 PUD 146e15067 PMD 3d2686067 PTE 3d4af9161 Oops: 0003 [#1] PREEMPT SMP CPU: 2 PID: 1798 Comm: qemu-system-x86 Not tainted 4.14.0-rc4+ #6 RIP: 0010:vmx_vcpu_run+0x202/0x510 [kvm_intel] Call Trace: WARNING: kernel stack frame pointer at ffffb86f4988bc18 in qemu-system-x86:1798 has bad value 0000000000000002
In L0:
-----------[ cut here ]------------ WARNING: CPU: 6 PID: 4460 at /home/kernel/linux/arch/x86/kvm//vmx.c:9845 vmx_inject_page_fault_nested+0x130/0x140 [kvm_intel] CPU: 6 PID: 4460 Comm: qemu-system-x86 Tainted: G OE 4.14.0-rc7+ #25 RIP: 0010:vmx_inject_page_fault_nested+0x130/0x140 [kvm_intel] Call Trace: paging64_page_fault+0x500/0xde0 [kvm] ? paging32_gva_to_gpa_nested+0x120/0x120 [kvm] ? nonpaging_page_fault+0x3b0/0x3b0 [kvm] ? __asan_storeN+0x12/0x20 ? paging64_gva_to_gpa+0xb0/0x120 [kvm] ? paging64_walk_addr_generic+0x11a0/0x11a0 [kvm] ? lock_acquire+0x2c0/0x2c0 ? vmx_read_guest_seg_ar+0x97/0x100 [kvm_intel] ? vmx_get_segment+0x2a6/0x310 [kvm_intel] ? sched_clock+0x1f/0x30 ? check_chain_key+0x137/0x1e0 ? __lock_acquire+0x83c/0x2420 ? kvm_multiple_exception+0xf2/0x220 [kvm] ? debug_check_no_locks_freed+0x240/0x240 ? debug_smp_processor_id+0x17/0x20 ? __lock_is_held+0x9e/0x100 kvm_mmu_page_fault+0x90/0x180 [kvm] kvm_handle_page_fault+0x15c/0x310 [kvm] ? __lock_is_held+0x9e/0x100 handle_exception+0x3c7/0x4d0 [kvm_intel] vmx_handle_exit+0x103/0x1010 [kvm_intel] ? kvm_arch_vcpu_ioctl_run+0x1628/0x2e20 [kvm]
The commit avoids to load host state of vmcs12 as vmcs01's guest state since vmcs12 is not modified (except for the VM-instruction error field) if the checking of vmcs control area fails. However, the mmu context is switched to nested mmu in prepare_vmcs02() and it will not be reloaded since load_vmcs12_host_state() is skipped when nested VMLAUNCH/VMRESUME fails. This patch fixes it by reloading mmu context when nested VMLAUNCH/VMRESUME fails.
Reviewed-by: Jim Mattson jmattson@google.com Reviewed-by: Krish Sadhukhan krish.sadhukhan@oracle.com Cc: Paolo Bonzini pbonzini@redhat.com Cc: Radim Krčmář rkrcmar@redhat.com Cc: Jim Mattson jmattson@google.com Signed-off-by: Wanpeng Li wanpeng.li@hotmail.com Reviewed-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/vmx.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 079cedb910bc..3a16e45f0c2e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11339,6 +11339,24 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, kvm_clear_interrupt_queue(vcpu); }
+static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 entry_failure_code; + + nested_ept_uninit_mmu_context(vcpu); + + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. + */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); + + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; +} + /* * A part of what we need to when the nested L2 guest exits and we want to * run its L1 parent, is to reset L1's guest state to the host state specified @@ -11352,7 +11370,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct kvm_segment seg; - u32 entry_failure_code;
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -11379,17 +11396,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); vmx_set_cr4(vcpu, vmcs12->host_cr4);
- nested_ept_uninit_mmu_context(vcpu); - - /* - * Only PDPTE load can fail as the value of cr3 was checked on entry and - * couldn't have changed. - */ - if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) - nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); - - if (!enable_ept) - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + load_vmcs12_mmu_host_state(vcpu, vmcs12);
if (enable_vpid) { /* @@ -11615,6 +11622,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * accordingly. */ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + load_vmcs12_mmu_host_state(vcpu, vmcs12); + /* * The emulated instruction was already skipped in * nested_vmx_run, but the updated RIP was never
From: Nikita Leshenko nikita.leshchenko@oracle.com
[ Upstream commit 0fc5a36dd6b345eb0d251a65c236e53bead3eef7 ]
KVM uses ioapic_handled_vectors to track vectors that need to notify the IOAPIC on EOI. The problem is that IOAPIC can be reconfigured while an interrupt with old configuration is pending or running and ioapic_handled_vectors only remembers the newest configuration; thus EOI from the old interrupt is not delievered to the IOAPIC.
A previous commit db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race") addressed this issue by adding pending edge-triggered interrupts to ioapic_handled_vectors, fixing this race for edge-triggered interrupts. The commit explicitly ignored level-triggered interrupts, but this race applies to them as well:
1) IOAPIC sends a level triggered interrupt vector to VCPU0 2) VCPU0's handler deasserts the irq line and reconfigures the IOAPIC to route the vector to VCPU1. The reconfiguration rewrites only the upper 32 bits of the IOREDTBLn register. (Causes KVM to update ioapic_handled_vectors for VCPU0 and it no longer includes the vector.) 3) VCPU0 sends EOI for the vector, but it's not delievered to the IOAPIC because the ioapic_handled_vectors doesn't include the vector. 4) New interrupts are not delievered to VCPU1 because remote_irr bit is set forever.
Therefore, the correct behavior is to add all pending and running interrupts to ioapic_handled_vectors.
This commit introduces a slight performance hit similar to commit db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race") for the rare case that the vector is reused by a non-IOAPIC source on VCPU0. We prefer to keep solution simple and not handle this case just as the original commit does.
Fixes: db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race")
Signed-off-by: Nikita Leshenko nikita.leshchenko@oracle.com Reviewed-by: Liran Alon liran.alon@oracle.com Signed-off-by: Konrad Rzeszutek Wilk konrad.wilk@oracle.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/ioapic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index bdff437acbcb..ae0a7dc318b2 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -257,8 +257,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, e->fields.dest_mode) || - (e->fields.trig_mode == IOAPIC_EDGE_TRIG && - kvm_apic_pending_eoi(vcpu, e->fields.vector))) + kvm_apic_pending_eoi(vcpu, e->fields.vector)) __set_bit(e->fields.vector, ioapic_handled_vectors); }
From: David Hildenbrand david@redhat.com
[ Upstream commit 4d772cb85f64c16eca00177089ecb3cd5d292120 ]
Commit 9d643f63128b ("KVM: x86: avoid large stack allocations in em_fxrstor") optimize the stack size, but introduced a guest memory access which might sleep while in atomic.
Fix it by introducing, again, a second fxregs_state. Try to avoid large stacks by using noinline. Add some helpful comments.
Reported by syzbot:
in_atomic(): 1, irqs_disabled(): 0, pid: 2909, name: syzkaller879109 2 locks held by syzkaller879109/2909: #0: (&vcpu->mutex){+.+.}, at: [<ffffffff8106222c>] vcpu_load+0x1c/0x70 arch/x86/kvm/../../../virt/kvm/kvm_main.c:154 #1: (&kvm->srcu){....}, at: [<ffffffff810dd162>] vcpu_enter_guest arch/x86/kvm/x86.c:6983 [inline] #1: (&kvm->srcu){....}, at: [<ffffffff810dd162>] vcpu_run arch/x86/kvm/x86.c:7061 [inline] #1: (&kvm->srcu){....}, at: [<ffffffff810dd162>] kvm_arch_vcpu_ioctl_run+0x1bc2/0x58b0 arch/x86/kvm/x86.c:7222 CPU: 1 PID: 2909 Comm: syzkaller879109 Not tainted 4.13.0-rc4-next-20170811 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 ___might_sleep+0x2b2/0x470 kernel/sched/core.c:6014 __might_sleep+0x95/0x190 kernel/sched/core.c:5967 __might_fault+0xab/0x1d0 mm/memory.c:4383 __copy_from_user include/linux/uaccess.h:71 [inline] __kvm_read_guest_page+0x58/0xa0 arch/x86/kvm/../../../virt/kvm/kvm_main.c:1771 kvm_vcpu_read_guest_page+0x44/0x60 arch/x86/kvm/../../../virt/kvm/kvm_main.c:1791 kvm_read_guest_virt_helper+0x76/0x140 arch/x86/kvm/x86.c:4407 kvm_read_guest_virt_system+0x3c/0x50 arch/x86/kvm/x86.c:4466 segmented_read_std+0x10c/0x180 arch/x86/kvm/emulate.c:819 em_fxrstor+0x27b/0x410 arch/x86/kvm/emulate.c:4022 x86_emulate_insn+0x55d/0x3c50 arch/x86/kvm/emulate.c:5471 x86_emulate_instruction+0x411/0x1ca0 arch/x86/kvm/x86.c:5698 kvm_mmu_page_fault+0x18b/0x2c0 arch/x86/kvm/mmu.c:4854 handle_ept_violation+0x1fc/0x5e0 arch/x86/kvm/vmx.c:6400 vmx_handle_exit+0x281/0x1ab0 arch/x86/kvm/vmx.c:8718 vcpu_enter_guest arch/x86/kvm/x86.c:6999 [inline] vcpu_run arch/x86/kvm/x86.c:7061 [inline] kvm_arch_vcpu_ioctl_run+0x1cee/0x58b0 arch/x86/kvm/x86.c:7222 kvm_vcpu_ioctl+0x64c/0x1010 arch/x86/kvm/../../../virt/kvm/kvm_main.c:2591 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x437fc9 RSP: 002b:00007ffc7b4d5ab8 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000004002b0 RCX: 0000000000437fc9 RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 0000000000000005 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000020ae8000 R10: 0000000000009120 R11: 0000000000000206 R12: 0000000000000000 R13: 0000000000000004 R14: 0000000000000004 R15: 0000000020077000
Fixes: 9d643f63128b ("KVM: x86: avoid large stack allocations in em_fxrstor") Signed-off-by: David Hildenbrand david@redhat.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/emulate.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1a2543545e5d..eca6a89f2326 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4023,6 +4023,26 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) fxstate_size(ctxt)); }
+/* + * FXRSTOR might restore XMM registers not provided by the guest. Fill + * in the host registers (via FXSAVE) instead, so they won't be modified. + * (preemption has to stay disabled until FXRSTOR). + * + * Use noinline to keep the stack for other functions called by callers small. + */ +static noinline int fxregs_fixup(struct fxregs_state *fx_state, + const size_t used_size) +{ + struct fxregs_state fx_tmp; + int rc; + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_tmp)); + memcpy((void *)fx_state + used_size, (void *)&fx_tmp + used_size, + __fxstate_size(16) - used_size); + + return rc; +} + static int em_fxrstor(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; @@ -4033,19 +4053,19 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc;
+ size = fxstate_size(ctxt); + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); + if (rc != X86EMUL_CONTINUE) + return rc; + ctxt->ops->get_fpu(ctxt);
- size = fxstate_size(ctxt); if (size < __fxstate_size(16)) { - rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + rc = fxregs_fixup(&fx_state, size); if (rc != X86EMUL_CONTINUE) goto out; }
- rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); - if (rc != X86EMUL_CONTINUE) - goto out; - if (fx_state.mxcsr >> 16) { rc = emulate_gp(ctxt, 0); goto out;
From: Nikita Leshenko nikita.leshchenko@oracle.com
[ Upstream commit a8bfec2930525808c01f038825d1df3904638631 ]
Some OSes (Linux, Xen) use this behavior to clear the Remote IRR bit for IOAPICs without an EOI register. They simulate the EOI message manually by changing the trigger mode to edge and then back to level, with the entry being masked during this.
QEMU implements this feature in commit ed1263c363c9 ("ioapic: clear remote irr bit for edge-triggered interrupts")
As a side effect, this commit removes an incorrect behavior where Remote IRR was cleared when the redirection table entry was rewritten. This is not consistent with the manual and also opens an opportunity for a strange behavior when a redirection table entry is modified from an interrupt handler that handles the same entry: The modification will clear the Remote IRR bit even though the interrupt handler is still running.
Signed-off-by: Nikita Leshenko nikita.leshchenko@oracle.com Reviewed-by: Liran Alon liran.alon@oracle.com Signed-off-by: Konrad Rzeszutek Wilk konrad.wilk@oracle.com Reviewed-by: Wanpeng Li wanpeng.li@hotmail.com Reviewed-by: Steve Rutherford srutherford@google.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/ioapic.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index ae0a7dc318b2..5c9ad29e11fd 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -304,8 +304,17 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } else { e->bits &= ~0xffffffffULL; e->bits |= (u32) val; - e->fields.remote_irr = 0; } + + /* + * Some OSes (Linux, Xen) assume that Remote IRR bit will + * be cleared by IOAPIC hardware when the entry is configured + * as edge-triggered. This behavior is used to simulate an + * explicit EOI on IOAPICs that don't have the EOI register. + */ + if (e->fields.trig_mode == IOAPIC_EDGE_TRIG) + e->fields.remote_irr = 0; + mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
From: Nikita Leshenko nikita.leshchenko@oracle.com
[ Upstream commit b200dded0a6974a3b69599832b2203483920ab25 ]
According to 82093AA (IOAPIC) manual, Remote IRR and Delivery Status are read-only. QEMU implements the bits as RO in commit 479c2a1cb7fb ("ioapic: keep RO bits for IOAPIC entry").
Signed-off-by: Nikita Leshenko nikita.leshchenko@oracle.com Reviewed-by: Liran Alon liran.alon@oracle.com Signed-off-by: Konrad Rzeszutek Wilk konrad.wilk@oracle.com Reviewed-by: Wanpeng Li wanpeng.li@hotmail.com Reviewed-by: Steve Rutherford srutherford@google.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/ioapic.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 5c9ad29e11fd..9d270ba9643c 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -276,6 +276,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; bool mask_before, mask_after; + int old_remote_irr, old_delivery_status; union kvm_ioapic_redirect_entry *e;
switch (ioapic->ioregsel) { @@ -298,6 +299,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) return; e = &ioapic->redirtbl[index]; mask_before = e->fields.mask; + /* Preserve read-only fields */ + old_remote_irr = e->fields.remote_irr; + old_delivery_status = e->fields.delivery_status; if (ioapic->ioregsel & 1) { e->bits &= 0xffffffff; e->bits |= (u64) val << 32; @@ -305,6 +309,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits &= ~0xffffffffULL; e->bits |= (u32) val; } + e->fields.remote_irr = old_remote_irr; + e->fields.delivery_status = old_delivery_status;
/* * Some OSes (Linux, Xen) assume that Remote IRR bit will
From: Liran Alon liran.alon@oracle.com
[ Upstream commit 917dc6068bc12a2dafffcf0e9d405ddb1b8780cb ]
vmx_check_nested_events() should return -EBUSY only in case there is a pending L1 event which requires a VMExit from L2 to L1 but such a VMExit is currently blocked. Such VMExits are blocked either because nested_run_pending=1 or an event was reinjected to L2. vmx_check_nested_events() should return 0 in case there are no pending L1 events which requires a VMExit from L2 to L1 or if a VMExit from L2 to L1 was done internally.
However, upstream commit which introduced blocking in case an event was reinjected to L2 (commit acc9ab601327 ("KVM: nVMX: Fix pending events injection")) contains a bug: It returns -EBUSY even if there are no pending L1 events which requires VMExit from L2 to L1.
This commit fix this issue.
Fixes: acc9ab601327 ("KVM: nVMX: Fix pending events injection")
Signed-off-by: Liran Alon liran.alon@oracle.com Reviewed-by: Nikita Leshenko nikita.leshchenko@oracle.com Signed-off-by: Konrad Rzeszutek Wilk konrad.wilk@oracle.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/vmx.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3a16e45f0c2e..d073f91bc089 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11114,13 +11114,12 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qual; - - if (kvm_event_needs_reinjection(vcpu)) - return -EBUSY; + bool block_nested_events = + vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
if (vcpu->arch.exception.pending && nested_vmx_check_exception(vcpu, &exit_qual)) { - if (vmx->nested.nested_run_pending) + if (block_nested_events) return -EBUSY; nested_vmx_inject_exception_vmexit(vcpu, exit_qual); vcpu->arch.exception.pending = false; @@ -11129,14 +11128,14 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && vmx->nested.preemption_timer_expired) { - if (vmx->nested.nested_run_pending) + if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); return 0; }
if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { - if (vmx->nested.nested_run_pending) + if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, NMI_VECTOR | INTR_TYPE_NMI_INTR | @@ -11152,7 +11151,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && nested_exit_on_intr(vcpu)) { - if (vmx->nested.nested_run_pending) + if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); return 0;
From: Sagi Grimberg sagi@grimberg.me
[ Upstream commit 48832f8d58cfedb2f9bee11bbfbb657efb42e7e7 ]
When the fabrics queue is not alive and fully functional, no commands should be allowed to pass but connect (which moves the queue to a fully functional state). Any other command should be failed, with either temporary status BLK_STS_RESOUCE or permanent status BLK_STS_IOERR.
This is shared across all fabrics, hence move the check to fabrics library.
Signed-off-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/nvme/host/fabrics.h | 30 ++++++++++++++++++++++++++++++ drivers/nvme/host/rdma.c | 32 ++++++-------------------------- 2 files changed, 36 insertions(+), 26 deletions(-)
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index bf33663218cd..9ff8529a64a9 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -142,4 +142,34 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
+static inline blk_status_t nvmf_check_init_req(struct nvme_ctrl *ctrl, + struct request *rq) +{ + struct nvme_command *cmd = nvme_req(rq)->cmd; + + /* + * We cannot accept any other command until the connect command has + * completed, so only allow connect to pass. + */ + if (!blk_rq_is_passthrough(rq) || + cmd->common.opcode != nvme_fabrics_command || + cmd->fabrics.fctype != nvme_fabrics_type_connect) { + /* + * Reconnecting state means transport disruption, which can take + * a long time and even might fail permanently, fail fast to + * give upper layers a chance to failover. + * Deleting state means that the ctrl will never accept commands + * again, fail it permanently. + */ + if (ctrl->state == NVME_CTRL_RECONNECTING || + ctrl->state == NVME_CTRL_DELETING) { + nvme_req(rq)->status = NVME_SC_ABORT_REQ; + return BLK_STS_IOERR; + } + return BLK_STS_RESOURCE; /* try again later */ + } + + return BLK_STS_OK; +} + #endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 0ebb539f3bd3..6de163e6c9eb 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1603,31 +1603,11 @@ nvme_rdma_timeout(struct request *rq, bool reserved) * We cannot accept any other command until the Connect command has completed. */ static inline blk_status_t -nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, struct request *rq) -{ - if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) { - struct nvme_command *cmd = nvme_req(rq)->cmd; - - if (!blk_rq_is_passthrough(rq) || - cmd->common.opcode != nvme_fabrics_command || - cmd->fabrics.fctype != nvme_fabrics_type_connect) { - /* - * reconnecting state means transport disruption, which - * can take a long time and even might fail permanently, - * fail fast to give upper layers a chance to failover. - * deleting state means that the ctrl will never accept - * commands again, fail it permanently. - */ - if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING || - queue->ctrl->ctrl.state == NVME_CTRL_DELETING) { - nvme_req(rq)->status = NVME_SC_ABORT_REQ; - return BLK_STS_IOERR; - } - return BLK_STS_RESOURCE; /* try again later */ - } - } - - return 0; +nvme_rdma_is_ready(struct nvme_rdma_queue *queue, struct request *rq) +{ + if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) + return nvmf_check_init_req(&queue->ctrl->ctrl, rq); + return BLK_STS_OK; }
static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, @@ -1646,7 +1626,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
WARN_ON_ONCE(rq->tag < 0);
- ret = nvme_rdma_queue_is_ready(queue, rq); + ret = nvme_rdma_is_ready(queue, rq); if (unlikely(ret)) return ret;
From: Janakarajan Natarajan Janakarajan.Natarajan@amd.com
[ Upstream commit 50a671d4d15b859f447fa527191073019b6ce9cb ]
The function for CPUID 80000001 ECX is set to 0xc0000001. Set it to 0x80000001.
Signed-off-by: Janakarajan Natarajan Janakarajan.Natarajan@amd.com Reviewed-by: Jim Mattson jmattson@google.com Reviewed-by: Krish Sadhukhan krish.sadhukhan@oracle.com Reviewed-by: Borislav Petkov bp@suse.de Fixes: d6321d493319 ("KVM: x86: generalize guest_cpuid_has_ helpers") Signed-off-by: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/cpuid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index cdc70a3a6583..c2cea6651279 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -44,7 +44,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX}, [CPUID_1_ECX] = { 1, 0, CPUID_ECX}, [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX}, - [CPUID_8000_0001_ECX] = {0xc0000001, 0, CPUID_ECX}, + [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX}, [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX},
From: Sagi Grimberg sagi@grimberg.me
[ Upstream commit 9d7fab04b95e8c26014a9bfc1c943b8360b44c17 ]
In case the queue is not LIVE (fully functional and connected at the nvmf level), we cannot allow any commands other than connect to pass through.
Add a new queue state flag NVME_LOOP_Q_LIVE which is set after nvmf connect and cleared in queue teardown.
Signed-off-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/nvme/target/loop.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 92628c432926..02aff5cc48bf 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -61,10 +61,15 @@ static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) return container_of(ctrl, struct nvme_loop_ctrl, ctrl); }
+enum nvme_loop_queue_flags { + NVME_LOOP_Q_LIVE = 0, +}; + struct nvme_loop_queue { struct nvmet_cq nvme_cq; struct nvmet_sq nvme_sq; struct nvme_loop_ctrl *ctrl; + unsigned long flags; };
static struct nvmet_port *nvmet_loop_port; @@ -153,6 +158,14 @@ nvme_loop_timeout(struct request *rq, bool reserved) return BLK_EH_HANDLED; }
+static inline blk_status_t nvme_loop_is_ready(struct nvme_loop_queue *queue, + struct request *rq) +{ + if (unlikely(!test_bit(NVME_LOOP_Q_LIVE, &queue->flags))) + return nvmf_check_init_req(&queue->ctrl->ctrl, rq); + return BLK_STS_OK; +} + static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -162,6 +175,10 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret;
+ ret = nvme_loop_is_ready(queue, req); + if (unlikely(ret)) + return ret; + ret = nvme_setup_cmd(ns, req, &iod->cmd); if (ret) return ret; @@ -275,6 +292,7 @@ static const struct blk_mq_ops nvme_loop_admin_mq_ops = {
static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) { + clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); blk_cleanup_queue(ctrl->ctrl.admin_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); @@ -305,8 +323,10 @@ static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl) { int i;
- for (i = 1; i < ctrl->ctrl.queue_count; i++) + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + } }
static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl) @@ -346,6 +366,7 @@ static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl) ret = nvmf_connect_io_queue(&ctrl->ctrl, i); if (ret) return ret; + set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); }
return 0; @@ -387,6 +408,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) if (error) goto out_cleanup_queue;
+ set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); if (error) { dev_err(ctrl->ctrl.device,
From: Sagi Grimberg sagi@grimberg.me
[ Upstream commit 9e0ed16ab9a9aaf670b81c9cd05b5e50defed654 ]
In case the queue is not LIVE (fully functional and connected at the nvmf level), we cannot allow any commands other than connect to pass through.
Add a new queue state flag NVME_FC_Q_LIVE which is set after nvmf connect and cleared in queue teardown.
Signed-off-by: Sagi Grimberg sagi@grimberg.me Reviewed-by: James Smart james.smart@broadcom.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/nvme/host/fc.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index be49d0f79381..3148d760d825 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -41,6 +41,7 @@
enum nvme_fc_queue_flags { NVME_FC_Q_CONNECTED = (1 << 0), + NVME_FC_Q_LIVE = (1 << 1), };
#define NVMEFC_QUEUE_DELAY 3 /* ms units */ @@ -1654,6 +1655,7 @@ nvme_fc_free_queue(struct nvme_fc_queue *queue) if (!test_and_clear_bit(NVME_FC_Q_CONNECTED, &queue->flags)) return;
+ clear_bit(NVME_FC_Q_LIVE, &queue->flags); /* * Current implementation never disconnects a single queue. * It always terminates a whole association. So there is never @@ -1661,7 +1663,6 @@ nvme_fc_free_queue(struct nvme_fc_queue *queue) */
queue->connection_id = 0; - clear_bit(NVME_FC_Q_CONNECTED, &queue->flags); }
static void @@ -1740,6 +1741,8 @@ nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) ret = nvmf_connect_io_queue(&ctrl->ctrl, i); if (ret) break; + + set_bit(NVME_FC_Q_LIVE, &ctrl->queues[i].flags); }
return ret; @@ -2048,6 +2051,14 @@ busy: return BLK_STS_RESOURCE; }
+static inline blk_status_t nvme_fc_is_ready(struct nvme_fc_queue *queue, + struct request *rq) +{ + if (unlikely(!test_bit(NVME_FC_Q_LIVE, &queue->flags))) + return nvmf_check_init_req(&queue->ctrl->ctrl, rq); + return BLK_STS_OK; +} + static blk_status_t nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) @@ -2063,6 +2074,10 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, u32 data_len; blk_status_t ret;
+ ret = nvme_fc_is_ready(queue, rq); + if (unlikely(ret)) + return ret; + ret = nvme_setup_cmd(ns, rq, sqe); if (ret) return ret; @@ -2398,6 +2413,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_disconnect_admin_queue;
+ set_bit(NVME_FC_Q_LIVE, &ctrl->queues[0].flags); + /* * Check controller capabilities *
From: Kai-Heng Feng kai.heng.feng@canonical.com
[ Upstream commit 8427bbc224863e14d905c87920d4005cb3e88ac3 ]
The NVMe device in question drops off the PCIe bus after system suspend. I've tried several approaches to workaround this issue, but none of them works: - NVME_QUIRK_DELAY_BEFORE_CHK_RDY - NVME_QUIRK_NO_DEEPEST_PS - Disable APST before controller shutdown - Delay between controller shutdown and system suspend - Explicitly set power state to 0 before controller shutdown
Fortunately it's a desktop, so disable APST won't hurt the battery.
Also, change the quirk function name to reflect it's for vendor combination quirks.
BugLink: https://bugs.launchpad.net/bugs/1705748 Signed-off-by: Kai-Heng Feng kai.heng.feng@canonical.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/nvme/host/pci.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 75539f7c58b9..66d872b05024 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2282,7 +2282,7 @@ static int nvme_dev_map(struct nvme_dev *dev) return -ENODEV; }
-static unsigned long check_dell_samsung_bug(struct pci_dev *pdev) +static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) { if (pdev->vendor == 0x144d && pdev->device == 0xa802) { /* @@ -2297,6 +2297,14 @@ static unsigned long check_dell_samsung_bug(struct pci_dev *pdev) (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") || dmi_match(DMI_PRODUCT_NAME, "Precision 5510"))) return NVME_QUIRK_NO_DEEPEST_PS; + } else if (pdev->vendor == 0x144d && pdev->device == 0xa804) { + /* + * Samsung SSD 960 EVO drops off the PCIe bus after system + * suspend on a Ryzen board, ASUS PRIME B350M-A. + */ + if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") && + dmi_match(DMI_BOARD_NAME, "PRIME B350M-A")) + return NVME_QUIRK_NO_APST; }
return 0; @@ -2336,7 +2344,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto unmap;
- quirks |= check_dell_samsung_bug(pdev); + quirks |= check_vendor_combination_bug(pdev);
result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, quirks);
From: Minwoo Im minwoo.im.dev@gmail.com
[ Upstream commit 244a8fe40a09c218622eb9927b9090b0a9b73a1a ]
hmb descriptor idx out-of-bound occurs in case of below conditions. preferred = 128MiB chunk_size = 4MiB hmmaxd = 1
Current code will not allow rmmod which will free hmb descriptors to be done successfully in above case.
"descs[i]" will be set in for-loop without seeing any conditions related to "max_entries" after a single "descs" was allocated by (max_entries = 1) in this case.
Added a condition into for-loop to check index of descriptors.
Fixes: 044a9df1("nvme-pci: implement the HMB entry number and size limitations") Signed-off-by: Minwoo Im minwoo.im.dev@gmail.com Reviewed-by: Keith Busch keith.busch@intel.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 66d872b05024..5dae650438c1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1645,7 +1645,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, if (!bufs) goto out_free_descs;
- for (size = 0; size < preferred; size += len) { + for (size = 0; size < preferred && i < max_entries; size += len) { dma_addr_t dma_addr;
len = min_t(u64, chunk_size, preferred - size);
From: James Smart jsmart2021@gmail.com
[ Upstream commit 619c62dcc62b957d17cccde2081cad527b020883 ]
Whenever a cmd is received a reference is taken while looking up the queue. The reference is removed after the cmd is done as the iod is returned for reuse. The fod may be reused for a deferred (recevied but no job context) cmd. Existing code removes the reference only if the fod is not reused for another command. Given the fod may be used for one or more ios, although a reference was taken per io, it won't be matched on the frees.
Remove the reference on every fod free. This pairs the references to each io.
Signed-off-by: James Smart james.smart@broadcom.com Reviewed-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/nvme/target/fc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 58e010bdda3e..8e21211b904b 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -532,15 +532,15 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue,
tgtport->ops->fcp_req_release(&tgtport->fc_target_port, fcpreq);
+ /* release the queue lookup reference on the completed IO */ + nvmet_fc_tgt_q_put(queue); + spin_lock_irqsave(&queue->qlock, flags); deferfcp = list_first_entry_or_null(&queue->pending_cmd_list, struct nvmet_fc_defer_fcp_req, req_list); if (!deferfcp) { list_add_tail(&fod->fcp_list, &fod->queue->fod_list); spin_unlock_irqrestore(&queue->qlock, flags); - - /* Release reference taken at queue lookup and fod allocation */ - nvmet_fc_tgt_q_put(queue); return; }
@@ -759,6 +759,9 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) tgtport->ops->fcp_req_release(&tgtport->fc_target_port, deferfcp->fcp_req);
+ /* release the queue lookup reference */ + nvmet_fc_tgt_q_put(queue); + kfree(deferfcp);
spin_lock_irqsave(&queue->qlock, flags);
From: Thomas Richter tmricht@linux.vnet.ibm.com
[ Upstream commit 38389ec84e835fa31a59b7dabb18343106a6d0d5 ]
Commit 1887aa07b676 ("s390/topology: add detection of dedicated vs shared CPUs") introduced following compiler error when CONFIG_SCHED_TOPOLOGY is not set.
CC arch/s390/kernel/smp.o ... arch/s390/kernel/smp.c: In function ‘smp_start_secondary’: arch/s390/kernel/smp.c:812:6: error: implicit declaration of function ‘topology_cpu_dedicated’; did you mean ‘topology_cpu_init’?
This patch fixes the compiler error by adding function topology_cpu_dedicated() to return false when this config option is not defined.
Signed-off-by: Thomas Richter tmricht@linux.vnet.ibm.com Reviewed-by: Heiko Carstens heiko.carstens@de.ibm.com Signed-off-by: Martin Schwidefsky schwidefsky@de.ibm.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/s390/include/asm/topology.h | 1 + arch/s390/kernel/smp.c | 1 + 2 files changed, 2 insertions(+)
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 55de4eb73604..de0a8b17bcaa 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -51,6 +51,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu); static inline void topology_init_early(void) { } static inline void topology_schedule_update(void) { } static inline int topology_cpu_init(struct cpu *cpu) { return 0; } +static inline int topology_cpu_dedicated(int cpu_nr) { return 0; } static inline void topology_expect_change(void) { }
#endif /* CONFIG_SCHED_TOPOLOGY */ diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 092c4154abd7..7ffaf9fd6d19 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -54,6 +54,7 @@ #include <asm/sigp.h> #include <asm/idle.h> #include <asm/nmi.h> +#include <asm/topology.h> #include "entry.h"
enum {
From: Harald Freudenberger freude@linux.vnet.ibm.com
[ Upstream commit 0b0882672640ced4deeebf84da0b88b6389619c4 ]
The function to decide if one zcrypt queue is better than another one compared two pointers instead of comparing the values where the pointers refer to. So within the same zcrypt card when load of each queue was equal just one queue was used. This effect only appears on relatively lite load, typically with one thread applications.
This patch fixes the wrong comparison and now the counters show that requests are balanced equally over all available queues within the cards.
There is no performance improvement coming with this fix. As long as the queue depth for an APQN queue is not touched, processing is not faster when requests are spread over queues within the same card hardware. So this fix only beautifies the lszcrypt counter printouts.
Signed-off-by: Harald Freudenberger freude@linux.vnet.ibm.com Signed-off-by: Martin Schwidefsky schwidefsky@de.ibm.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/s390/crypto/zcrypt_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index b5f4006198b9..a9a56aa9c26b 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -218,8 +218,8 @@ static inline bool zcrypt_queue_compare(struct zcrypt_queue *zq, weight += atomic_read(&zq->load); pref_weight += atomic_read(&pref_zq->load); if (weight == pref_weight) - return &zq->queue->total_request_count > - &pref_zq->queue->total_request_count; + return zq->queue->total_request_count > + pref_zq->queue->total_request_count; return weight > pref_weight; }
From: Hans de Goede hdegoede@redhat.com
[ Upstream commit 10809bb976648ac58194a629e3d7af99e7400297 ]
Most Bay and Cherry Trail devices use a generic DSDT with all possible peripheral devices present in the DSDT, with their _STA returning 0x00 or 0x0f based on AML variables which describe what is actually present on the board.
Since ACPI device objects with a 0x00 status (not present) still get an entry under /sys/bus/acpi/devices, and those entry had an acpi:PNPID modalias, userspace would end up loading modules for non present hardware.
This commit fixes this by leaving the modalias empty for non present devices. This results in 10 modules less being loaded with a generic distro kernel config on my Cherry Trail test-device (a GPD pocket).
Signed-off-by: Hans de Goede hdegoede@redhat.com Signed-off-by: Rafael J. Wysocki rafael.j.wysocki@intel.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/acpi/device_sysfs.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/acpi/device_sysfs.c b/drivers/acpi/device_sysfs.c index 24418932612e..a041689e5701 100644 --- a/drivers/acpi/device_sysfs.c +++ b/drivers/acpi/device_sysfs.c @@ -146,6 +146,10 @@ static int create_pnp_modalias(struct acpi_device *acpi_dev, char *modalias, int count; struct acpi_hardware_id *id;
+ /* Avoid unnecessarily loading modules for non present devices. */ + if (!acpi_device_is_present(acpi_dev)) + return 0; + /* * Since we skip ACPI_DT_NAMESPACE_HID from the modalias below, 0 should * be returned if ACPI_DT_NAMESPACE_HID is the only ACPI/PNP ID in the
From: James Hogan jhogan@kernel.org
[ Upstream commit 0d307935fefa6389eb726c6362351c162c949101 ]
The MIPS loongson cpufreq drivers don't build unless configured for the correct machine type, due to dependency on machine specific architecture headers and symbols in machine specific platform code.
More specifically loongson1-cpufreq.c uses RST_CPU_EN and RST_CPU, neither of which is defined in asm/mach-loongson32/regs-clk.h unless CONFIG_LOONGSON1_LS1B=y, and loongson2_cpufreq.c references loongson2_clockmod_table[], which is only defined in arch/mips/loongson64/lemote-2f/clock.c, i.e. when CONFIG_LEMOTE_MACH2F=y.
Add these dependencies to Kconfig to avoid randconfig / allyesconfig build failures (e.g. when based on BMIPS which also has a cpufreq driver).
Signed-off-by: James Hogan jhogan@kernel.org Acked-by: Viresh Kumar viresh.kumar@linaro.org Signed-off-by: Rafael J. Wysocki rafael.j.wysocki@intel.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/cpufreq/Kconfig | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 4ebae43118ef..d8addbce40bc 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -275,6 +275,7 @@ config BMIPS_CPUFREQ
config LOONGSON2_CPUFREQ tristate "Loongson2 CPUFreq Driver" + depends on LEMOTE_MACH2F help This option adds a CPUFreq driver for loongson processors which support software configurable cpu frequency. @@ -287,6 +288,7 @@ config LOONGSON2_CPUFREQ
config LOONGSON1_CPUFREQ tristate "Loongson1 CPUFreq Driver" + depends on LOONGSON1_LS1B help This option adds a CPUFreq driver for loongson1 processors which support software configurable cpu frequency.
From: David Disseldorp ddiss@suse.de
[ Upstream commit 1addb798e93893d33c8dfab743cd44f09fd7719a ]
null_alloc_dev() allocates memory for dev->badblocks, but cleanup currently only occurs in the configfs release codepath, missing a number of other places.
This bug was found running the blktests block/010 test, alongside kmemleak: rapido1:/blktests# ./check block/010 ... rapido1:/blktests# echo scan > /sys/kernel/debug/kmemleak [ 306.966708] kmemleak: 32 new suspected memory leaks (see /sys/kernel/debug/kmemleak) rapido1:/blktests# cat /sys/kernel/debug/kmemleak unreferenced object 0xffff88001f86d000 (size 4096): comm "modprobe", pid 231, jiffies 4294892415 (age 318.252s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<ffffffff814b0379>] kmemleak_alloc+0x49/0xa0 [<ffffffff810f180f>] kmem_cache_alloc+0x9f/0xe0 [<ffffffff8124e45f>] badblocks_init+0x2f/0x60 [<ffffffffa0019fae>] 0xffffffffa0019fae [<ffffffffa0021273>] nullb_device_badblocks_store+0x63/0x130 [null_blk] [<ffffffff810004cd>] do_one_initcall+0x3d/0x170 [<ffffffff8109fe0d>] do_init_module+0x56/0x1e9 [<ffffffff8109ebd7>] load_module+0x1c47/0x26a0 [<ffffffff8109f819>] SyS_finit_module+0xa9/0xd0 [<ffffffff814b4f60>] entry_SYSCALL_64_fastpath+0x13/0x94
Fixes: 2f54a613c942 ("nullb: badbblocks support") Reviewed-by: Shaohua Li shli@fb.com Signed-off-by: David Disseldorp ddiss@suse.de Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/block/null_blk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 4d55af5c6e5b..69dfa1d3f453 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -467,7 +467,6 @@ static void nullb_device_release(struct config_item *item) { struct nullb_device *dev = to_nullb_device(item);
- badblocks_exit(&dev->badblocks); null_free_device_storage(dev, false); null_free_dev(dev); } @@ -578,6 +577,10 @@ static struct nullb_device *null_alloc_dev(void)
static void null_free_dev(struct nullb_device *dev) { + if (!dev) + return; + + badblocks_exit(&dev->badblocks); kfree(dev); }
From: David Howells dhowells@redhat.com
[ Upstream commit 03a6c82218b9a87014b2c6c4e178294fdc8ebd8a ]
The caller of rxrpc_accept_call() must release the lock on call->user_mutex returned by that function.
Signed-off-by: David Howells dhowells@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- net/rxrpc/sendmsg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 9ea6f972767e..d2f51d6a253c 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -563,8 +563,8 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* The socket is now unlocked. */ if (IS_ERR(call)) return PTR_ERR(call); - rxrpc_put_call(call, rxrpc_call_put); - return 0; + ret = 0; + goto out_put_unlock; }
call = rxrpc_find_call_by_user_ID(rx, p.user_call_ID); @@ -633,6 +633,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_send_data(rx, call, msg, len, NULL); }
+out_put_unlock: mutex_unlock(&call->user_mutex); error_put: rxrpc_put_call(call, rxrpc_call_put);
From: Martin Schwidefsky schwidefsky@de.ibm.com
[ Upstream commit 53c4ab70c11c3ba1b9e3caa8e8c17e9c16d9cbc0 ]
git commit badb8bb983e9 "fix alloc_pgste check in init_new_context" fixed the problem of 'current->mm == NULL' in init_new_context back in 2011.
git commit 3eabaee998c7 "KVM: s390: allow sie enablement for multi- threaded programs" completely removed the check against alloc_pgste.
git commit 23fefe119ceb "s390/kvm: avoid global config of vm.alloc_pgste=1" re-added a check against the alloc_pgste flag but without the required check for current->mm != NULL.
For execve() called by a kernel thread init_new_context() reads from ((struct mm_struct *) NULL)->context.alloc_pgste to decide between 2K vs 4K page tables. If the bit happens to be set for the init process it will be created with large page tables. This decision is inherited by all the children of init, this waste quite some memory.
Re-add the check for 'current->mm != NULL'.
Fixes: 23fefe119ceb ("s390/kvm: avoid global config of vm.alloc_pgste=1") Signed-off-by: Martin Schwidefsky schwidefsky@de.ibm.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/s390/include/asm/mmu_context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 43607bb12cc2..a6cc744ff5fb 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -28,7 +28,7 @@ static inline int init_new_context(struct task_struct *tsk, #ifdef CONFIG_PGSTE mm->context.alloc_pgste = page_table_allocate_pgste || test_thread_flag(TIF_PGSTE) || - current->mm->context.alloc_pgste; + (current->mm && current->mm->context.alloc_pgste); mm->context.has_pgste = 0; mm->context.use_skey = 0; mm->context.use_cmma = 0;
From: David Howells dhowells@redhat.com
[ Upstream commit f859ab61875978eeaa539740ff7f7d91f5d60006 ]
RxRPC service endpoints expire like they're supposed to by the following means:
(1) Mark dead rxrpc_net structs (with ->live) rather than twiddling the global service conn timeout, otherwise the first rxrpc_net struct to die will cause connections on all others to expire immediately from then on.
(2) Mark local service endpoints for which the socket has been closed (->service_closed) so that the expiration timeout can be much shortened for service and client connections going through that endpoint.
(3) rxrpc_put_service_conn() needs to schedule the reaper when the usage count reaches 1, not 0, as idle conns have a 1 count.
(4) The accumulator for the earliest time we might want to schedule for should be initialised to jiffies + MAX_JIFFY_OFFSET, not ULONG_MAX as the comparison functions use signed arithmetic.
(5) Simplify the expiration handling, adding the expiration value to the idle timestamp each time rather than keeping track of the time in the past before which the idle timestamp must go to be expired. This is much easier to read.
(6) Ignore the timeouts if the net namespace is dead.
(7) Restart the service reaper work item rather the client reaper.
Signed-off-by: David Howells dhowells@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- include/trace/events/rxrpc.h | 2 ++ net/rxrpc/af_rxrpc.c | 13 +++++++++++++ net/rxrpc/ar-internal.h | 3 +++ net/rxrpc/conn_client.c | 2 ++ net/rxrpc/conn_object.c | 42 ++++++++++++++++++++++++------------------ net/rxrpc/net_ns.c | 3 +++ 6 files changed, 47 insertions(+), 18 deletions(-)
diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index ebe96796027a..a58caf5807ff 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -49,6 +49,7 @@ enum rxrpc_conn_trace { rxrpc_conn_put_client, rxrpc_conn_put_service, rxrpc_conn_queued, + rxrpc_conn_reap_service, rxrpc_conn_seen, };
@@ -206,6 +207,7 @@ enum rxrpc_congest_change { EM(rxrpc_conn_put_client, "PTc") \ EM(rxrpc_conn_put_service, "PTs") \ EM(rxrpc_conn_queued, "QUE") \ + EM(rxrpc_conn_reap_service, "RPs") \ E_(rxrpc_conn_seen, "SEE")
#define rxrpc_client_traces \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 4b0a8288c98a..7c1cb08874d5 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -823,6 +823,19 @@ static int rxrpc_release_sock(struct sock *sk) sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK;
+ /* We want to kill off all connections from a service socket + * as fast as possible because we can't share these; client + * sockets, on the other hand, can share an endpoint. + */ + switch (sk->sk_state) { + case RXRPC_SERVER_BOUND: + case RXRPC_SERVER_BOUND2: + case RXRPC_SERVER_LISTENING: + case RXRPC_SERVER_LISTEN_DISABLED: + rx->local->service_closed = true; + break; + } + spin_lock_bh(&sk->sk_receive_queue.lock); sk->sk_state = RXRPC_CLOSE; spin_unlock_bh(&sk->sk_receive_queue.lock); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 2b3992c5060e..e6c2c4f56fb1 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -84,6 +84,7 @@ struct rxrpc_net { unsigned int nr_client_conns; unsigned int nr_active_client_conns; bool kill_all_client_conns; + bool live; spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */ spinlock_t client_conn_discard_lock; /* Prevent multiple discarders */ struct list_head waiting_client_conns; @@ -265,6 +266,7 @@ struct rxrpc_local { rwlock_t services_lock; /* lock for services list */ int debug_id; /* debug ID for printks */ bool dead; + bool service_closed; /* Service socket closed */ struct sockaddr_rxrpc srx; /* local address */ };
@@ -824,6 +826,7 @@ void rxrpc_process_connection(struct work_struct *); * conn_object.c */ extern unsigned int rxrpc_connection_expiry; +extern unsigned int rxrpc_closed_conn_expiry;
struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 5f9624bd311c..78a154173d90 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -1061,6 +1061,8 @@ next: expiry = rxrpc_conn_idle_client_expiry; if (nr_conns > rxrpc_reap_client_connections) expiry = rxrpc_conn_idle_client_fast_expiry; + if (conn->params.local->service_closed) + expiry = rxrpc_closed_conn_expiry * HZ;
conn_expires_at = conn->idle_timestamp + expiry;
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index fe575798592f..a48c817b792b 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -20,7 +20,8 @@ /* * Time till a connection expires after last use (in seconds). */ -unsigned int rxrpc_connection_expiry = 10 * 60; +unsigned int __read_mostly rxrpc_connection_expiry = 10 * 60; +unsigned int __read_mostly rxrpc_closed_conn_expiry = 10;
static void rxrpc_destroy_connection(struct rcu_head *);
@@ -312,7 +313,7 @@ void rxrpc_put_service_conn(struct rxrpc_connection *conn) n = atomic_dec_return(&conn->usage); trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); ASSERTCMP(n, >=, 0); - if (n == 0) { + if (n == 1) { rxnet = conn->params.local->rxnet; rxrpc_queue_delayed_work(&rxnet->service_conn_reaper, 0); } @@ -353,15 +354,14 @@ void rxrpc_service_connection_reaper(struct work_struct *work) struct rxrpc_net *rxnet = container_of(to_delayed_work(work), struct rxrpc_net, service_conn_reaper); - unsigned long reap_older_than, earliest, idle_timestamp, now; + unsigned long expire_at, earliest, idle_timestamp, now;
LIST_HEAD(graveyard);
_enter("");
now = jiffies; - reap_older_than = now - rxrpc_connection_expiry * HZ; - earliest = ULONG_MAX; + earliest = now + MAX_JIFFY_OFFSET;
write_lock(&rxnet->conn_lock); list_for_each_entry_safe(conn, _p, &rxnet->service_conns, link) { @@ -371,15 +371,21 @@ void rxrpc_service_connection_reaper(struct work_struct *work) if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) continue;
- idle_timestamp = READ_ONCE(conn->idle_timestamp); - _debug("reap CONN %d { u=%d,t=%ld }", - conn->debug_id, atomic_read(&conn->usage), - (long)reap_older_than - (long)idle_timestamp); - - if (time_after(idle_timestamp, reap_older_than)) { - if (time_before(idle_timestamp, earliest)) - earliest = idle_timestamp; - continue; + if (rxnet->live) { + idle_timestamp = READ_ONCE(conn->idle_timestamp); + expire_at = idle_timestamp + rxrpc_connection_expiry * HZ; + if (conn->params.local->service_closed) + expire_at = idle_timestamp + rxrpc_closed_conn_expiry * HZ; + + _debug("reap CONN %d { u=%d,t=%ld }", + conn->debug_id, atomic_read(&conn->usage), + (long)expire_at - (long)now); + + if (time_before(now, expire_at)) { + if (time_before(expire_at, earliest)) + earliest = expire_at; + continue; + } }
/* The usage count sits at 1 whilst the object is unused on the @@ -387,6 +393,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) */ if (atomic_cmpxchg(&conn->usage, 1, 0) != 1) continue; + trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, 0);
if (rxrpc_conn_is_client(conn)) BUG(); @@ -397,10 +404,10 @@ void rxrpc_service_connection_reaper(struct work_struct *work) } write_unlock(&rxnet->conn_lock);
- if (earliest != ULONG_MAX) { - _debug("reschedule reaper %ld", (long) earliest - now); + if (earliest != now + MAX_JIFFY_OFFSET) { + _debug("reschedule reaper %ld", (long)earliest - (long)now); ASSERT(time_after(earliest, now)); - rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, + rxrpc_queue_delayed_work(&rxnet->service_conn_reaper, earliest - now); }
@@ -429,7 +436,6 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet)
rxrpc_destroy_all_client_connections(rxnet);
- rxrpc_connection_expiry = 0; cancel_delayed_work(&rxnet->client_conn_reaper); rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, 0); flush_workqueue(rxrpc_workqueue); diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 7edceb8522f5..684c51d600c7 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -22,6 +22,7 @@ static __net_init int rxrpc_init_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); int ret;
+ rxnet->live = true; get_random_bytes(&rxnet->epoch, sizeof(rxnet->epoch)); rxnet->epoch |= RXRPC_RANDOM_EPOCH;
@@ -60,6 +61,7 @@ static __net_init int rxrpc_init_net(struct net *net) return 0;
err_proc: + rxnet->live = false; return ret; }
@@ -70,6 +72,7 @@ static __net_exit void rxrpc_exit_net(struct net *net) { struct rxrpc_net *rxnet = rxrpc_net(net);
+ rxnet->live = false; rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); rxrpc_destroy_all_locals(rxnet);
From: David Howells dhowells@redhat.com
[ Upstream commit 9faaff593404a9c4e5abc6839a641635d7b9d0cd ]
Provide a different lockdep key for rxrpc_call::user_mutex when the call is made on a kernel socket, such as by the AFS filesystem.
The problem is that lockdep registers a false positive between userspace calling the sendmsg syscall on a user socket where call->user_mutex is held whilst userspace memory is accessed whereas the AFS filesystem may perform operations with mmap_sem held by the caller.
In such a case, the following warning is produced.
====================================================== WARNING: possible circular locking dependency detected 4.14.0-fscache+ #243 Tainted: G E ------------------------------------------------------ modpost/16701 is trying to acquire lock: (&vnode->io_lock){+.+.}, at: [<ffffffffa000fc40>] afs_begin_vnode_operation+0x33/0x77 [kafs]
but task is already holding lock: (&mm->mmap_sem){++++}, at: [<ffffffff8104376a>] __do_page_fault+0x1ef/0x486
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #3 (&mm->mmap_sem){++++}: __might_fault+0x61/0x89 _copy_from_iter_full+0x40/0x1fa rxrpc_send_data+0x8dc/0xff3 rxrpc_do_sendmsg+0x62f/0x6a1 rxrpc_sendmsg+0x166/0x1b7 sock_sendmsg+0x2d/0x39 ___sys_sendmsg+0x1ad/0x22b __sys_sendmsg+0x41/0x62 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75
-> #2 (&call->user_mutex){+.+.}: __mutex_lock+0x86/0x7d2 rxrpc_new_client_call+0x378/0x80e rxrpc_kernel_begin_call+0xf3/0x154 afs_make_call+0x195/0x454 [kafs] afs_vl_get_capabilities+0x193/0x198 [kafs] afs_vl_lookup_vldb+0x5f/0x151 [kafs] afs_create_volume+0x2e/0x2f4 [kafs] afs_mount+0x56a/0x8d7 [kafs] mount_fs+0x6a/0x109 vfs_kern_mount+0x67/0x135 do_mount+0x90b/0xb57 SyS_mount+0x72/0x98 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75
-> #1 (k-sk_lock-AF_RXRPC){+.+.}: lock_sock_nested+0x74/0x8a rxrpc_kernel_begin_call+0x8a/0x154 afs_make_call+0x195/0x454 [kafs] afs_fs_get_capabilities+0x17a/0x17f [kafs] afs_probe_fileserver+0xf7/0x2f0 [kafs] afs_select_fileserver+0x83f/0x903 [kafs] afs_fetch_status+0x89/0x11d [kafs] afs_iget+0x16f/0x4f8 [kafs] afs_mount+0x6c6/0x8d7 [kafs] mount_fs+0x6a/0x109 vfs_kern_mount+0x67/0x135 do_mount+0x90b/0xb57 SyS_mount+0x72/0x98 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75
-> #0 (&vnode->io_lock){+.+.}: lock_acquire+0x174/0x19f __mutex_lock+0x86/0x7d2 afs_begin_vnode_operation+0x33/0x77 [kafs] afs_fetch_data+0x80/0x12a [kafs] afs_readpages+0x314/0x405 [kafs] __do_page_cache_readahead+0x203/0x2ba filemap_fault+0x179/0x54d __do_fault+0x17/0x60 __handle_mm_fault+0x6d7/0x95c handle_mm_fault+0x24e/0x2a3 __do_page_fault+0x301/0x486 do_page_fault+0x236/0x259 page_fault+0x22/0x30 __clear_user+0x3d/0x60 padzero+0x1c/0x2b load_elf_binary+0x785/0xdc7 search_binary_handler+0x81/0x1ff do_execveat_common.isra.14+0x600/0x888 do_execve+0x1f/0x21 SyS_execve+0x28/0x2f do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75
other info that might help us debug this:
Chain exists of: &vnode->io_lock --> &call->user_mutex --> &mm->mmap_sem
Possible unsafe locking scenario:
CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(&call->user_mutex); lock(&mm->mmap_sem); lock(&vnode->io_lock);
*** DEADLOCK ***
1 lock held by modpost/16701: #0: (&mm->mmap_sem){++++}, at: [<ffffffff8104376a>] __do_page_fault+0x1ef/0x486
stack backtrace: CPU: 0 PID: 16701 Comm: modpost Tainted: G E 4.14.0-fscache+ #243 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 Call Trace: dump_stack+0x67/0x8e print_circular_bug+0x341/0x34f check_prev_add+0x11f/0x5d4 ? add_lock_to_list.isra.12+0x8b/0x8b ? add_lock_to_list.isra.12+0x8b/0x8b ? __lock_acquire+0xf77/0x10b4 __lock_acquire+0xf77/0x10b4 lock_acquire+0x174/0x19f ? afs_begin_vnode_operation+0x33/0x77 [kafs] __mutex_lock+0x86/0x7d2 ? afs_begin_vnode_operation+0x33/0x77 [kafs] ? afs_begin_vnode_operation+0x33/0x77 [kafs] ? afs_begin_vnode_operation+0x33/0x77 [kafs] afs_begin_vnode_operation+0x33/0x77 [kafs] afs_fetch_data+0x80/0x12a [kafs] afs_readpages+0x314/0x405 [kafs] __do_page_cache_readahead+0x203/0x2ba ? filemap_fault+0x179/0x54d filemap_fault+0x179/0x54d __do_fault+0x17/0x60 __handle_mm_fault+0x6d7/0x95c handle_mm_fault+0x24e/0x2a3 __do_page_fault+0x301/0x486 do_page_fault+0x236/0x259 page_fault+0x22/0x30 RIP: 0010:__clear_user+0x3d/0x60 RSP: 0018:ffff880071e93da0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 000000000000011c RCX: 000000000000011c RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000000000060f720 RBP: 000000000060f720 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: ffff8800b5459b68 R12: ffff8800ce150e00 R13: 000000000060f720 R14: 00000000006127a8 R15: 0000000000000000 padzero+0x1c/0x2b load_elf_binary+0x785/0xdc7 search_binary_handler+0x81/0x1ff do_execveat_common.isra.14+0x600/0x888 do_execve+0x1f/0x21 SyS_execve+0x28/0x2f do_syscall_64+0x89/0x1be entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7fdb6009ee07 RSP: 002b:00007fff566d9728 EFLAGS: 00000246 ORIG_RAX: 000000000000003b RAX: ffffffffffffffda RBX: 000055ba57280900 RCX: 00007fdb6009ee07 RDX: 000055ba5727f270 RSI: 000055ba5727cac0 RDI: 000055ba57280900 RBP: 000055ba57280900 R08: 00007fff566d9700 R09: 0000000000000000 R10: 000055ba5727cac0 R11: 0000000000000246 R12: 0000000000000000 R13: 000055ba5727cac0 R14: 000055ba5727f270 R15: 0000000000000000
Signed-off-by: David Howells dhowells@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- net/rxrpc/ar-internal.h | 2 +- net/rxrpc/call_accept.c | 2 +- net/rxrpc/call_object.c | 19 +++++++++++++++---- 3 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ea5600b747cc..2b3992c5060e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -671,7 +671,7 @@ extern unsigned int rxrpc_max_call_lifetime; extern struct kmem_cache *rxrpc_call_jar;
struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); -struct rxrpc_call *rxrpc_alloc_call(gfp_t); +struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index cbd1701e813a..3028298ca561 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -94,7 +94,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, /* Now it gets complicated, because calls get registered with the * socket here, particularly if a user ID is preassigned by the user. */ - call = rxrpc_alloc_call(gfp); + call = rxrpc_alloc_call(rx, gfp); if (!call) return -ENOMEM; call->flags |= (1 << RXRPC_CALL_IS_SERVICE); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index fcdd6555a820..8a5a42e8ec23 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -55,6 +55,8 @@ static void rxrpc_call_timer_expired(unsigned long _call) rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real()); }
+static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; + /* * find an extant server call * - called in process context with IRQs enabled @@ -95,7 +97,7 @@ found_extant_call: /* * allocate a new call */ -struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp) { struct rxrpc_call *call;
@@ -114,6 +116,14 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) goto nomem_2;
mutex_init(&call->user_mutex); + + /* Prevent lockdep reporting a deadlock false positive between the afs + * filesystem and sys_sendmsg() via the mmap sem. + */ + if (rx->sk.sk_kern_sock) + lockdep_set_class(&call->user_mutex, + &rxrpc_call_user_mutex_lock_class_key); + setup_timer(&call->timer, rxrpc_call_timer_expired, (unsigned long)call); INIT_WORK(&call->processor, &rxrpc_process_call); @@ -150,7 +160,8 @@ nomem: /* * Allocate a new client call. */ -static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, +static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, + struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; @@ -158,7 +169,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx,
_enter("");
- call = rxrpc_alloc_call(gfp); + call = rxrpc_alloc_call(rx, gfp); if (!call) return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; @@ -209,7 +220,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
_enter("%p,%lx", rx, user_call_ID);
- call = rxrpc_alloc_client_call(srx, gfp); + call = rxrpc_alloc_client_call(rx, srx, gfp); if (IS_ERR(call)) { release_sock(&rx->sk); _leave(" = %ld", PTR_ERR(call));
From: Michael Lyle mlyle@lyle.org
[ Upstream commit 6c4ca1e36cdc1a0a7a84797804b87920ccbebf51 ]
register_shrinker is now __must_check, so check it to kill a warning. Caller of bch_btree_cache_alloc in super.c appropriately checks return value so this is fully plumbed through.
This V2 fixes checkpatch warnings and improves the commit description, as I was too hasty getting the previous version out.
Signed-off-by: Michael Lyle mlyle@lyle.org Reviewed-by: Vojtech Pavlik vojtech@suse.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/md/bcache/btree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 658c54b3b07a..1598d1e04989 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -807,7 +807,10 @@ int bch_btree_cache_alloc(struct cache_set *c) c->shrink.scan_objects = bch_mca_scan; c->shrink.seeks = 4; c->shrink.batch = c->btree_pages * 2; - register_shrinker(&c->shrink); + + if (register_shrinker(&c->shrink)) + pr_warn("bcache: %s: could not register shrinker", + __func__);
return 0; }
From: Felix Kuehling Felix.Kuehling@amd.com
[ Upstream commit cf21654b40968609779751b34e7923180968fe5b ]
Fix the SDMA load and unload sequence as suggested by HW document.
Signed-off-by: shaoyun liu shaoyun.liu@amd.com Signed-off-by: Felix Kuehling Felix.Kuehling@amd.com Acked-by: Oded Gabbay oded.gabbay@gmail.com Signed-off-by: Oded Gabbay oded.gabbay@gmail.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 47 ++++++++++++++++------- 1 file changed, 34 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index b9dbbf9cb8b0..bdabaa3399db 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -369,29 +369,50 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) { struct amdgpu_device *adev = get_amdgpu_device(kgd); struct cik_sdma_rlc_registers *m; + unsigned long end_jiffies; uint32_t sdma_base_addr; + uint32_t data;
m = get_sdma_mqd(mqd); sdma_base_addr = get_sdma_base_addr(m);
- WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, - m->sdma_rlc_virtual_addr); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
- WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, - m->sdma_rlc_rb_base); + end_jiffies = msecs_to_jiffies(2000) + jiffies; + while (true) { + data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); + if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + break; + if (time_after(jiffies, end_jiffies)) + return -ETIME; + usleep_range(500, 1000); + } + if (m->sdma_engine_id) { + data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); + data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, + RESUME_CTX, 0); + WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); + } else { + data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); + data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, + RESUME_CTX, 0); + WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); + }
+ WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, + m->sdma_rlc_doorbell); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, + m->sdma_rlc_virtual_addr); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base); WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, m->sdma_rlc_rb_base_hi); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, m->sdma_rlc_rb_rptr_addr_lo); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdma_rlc_rb_rptr_addr_hi); - - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, - m->sdma_rlc_doorbell); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, m->sdma_rlc_rb_cntl);
@@ -564,9 +585,9 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, }
WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | + SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);
return 0; }
From: shaoyunl Shaoyun.Liu@amd.com
[ Upstream commit d12fb13f23199faa7e536acec1db49068e5a067d ]
ffs function return the position of the first bit set on 1 based. (bit zero returns 1).
Signed-off-by: shaoyun liu shaoyun.liu@amd.com Signed-off-by: Felix Kuehling Felix.Kuehling@amd.com Reviewed-by: Oded Gabbay oded.gabbay@gmail.com Signed-off-by: Oded Gabbay oded.gabbay@gmail.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 44ffd23348fc..164fa4b1f9a9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -205,8 +205,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, struct cik_sdma_rlc_registers *m;
m = get_sdma_mqd(mqd); - m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) << - SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
From: "Dmitry V. Levin" ldv@altlinux.org
[ Upstream commit b4d085201d86af69cbda2214c6dafc0be240ef9f ]
Consistently use types provided by <linux/types.h> via <drm/drm.h> to fix the following linux/kfd_ioctl.h userspace compilation errors:
/usr/include/linux/kfd_ioctl.h:236:2: error: unknown type name 'uint64_t' uint64_t va_addr; /* to KFD */ /usr/include/linux/kfd_ioctl.h:237:2: error: unknown type name 'uint32_t' uint32_t gpu_id; /* to KFD */ /usr/include/linux/kfd_ioctl.h:238:2: error: unknown type name 'uint32_t' uint32_t pad; /usr/include/linux/kfd_ioctl.h:243:2: error: unknown type name 'uint64_t' uint64_t tile_config_ptr; /usr/include/linux/kfd_ioctl.h:245:2: error: unknown type name 'uint64_t' uint64_t macro_tile_config_ptr; /usr/include/linux/kfd_ioctl.h:249:2: error: unknown type name 'uint32_t' uint32_t num_tile_configs; /usr/include/linux/kfd_ioctl.h:253:2: error: unknown type name 'uint32_t' uint32_t num_macro_tile_configs; /usr/include/linux/kfd_ioctl.h:255:2: error: unknown type name 'uint32_t' uint32_t gpu_id; /* to KFD */ /usr/include/linux/kfd_ioctl.h:256:2: error: unknown type name 'uint32_t' uint32_t gb_addr_config; /* from KFD */ /usr/include/linux/kfd_ioctl.h:257:2: error: unknown type name 'uint32_t' uint32_t num_banks; /* from KFD */ /usr/include/linux/kfd_ioctl.h:258:2: error: unknown type name 'uint32_t' uint32_t num_ranks; /* from KFD */
Fixes: 6a1c9510694fe ("drm/amdkfd: Adding new IOCTL for scratch memory v2") Fixes: 5d71dbc3a5886 ("drm/amdkfd: Implement image tiling mode support v2") Signed-off-by: Dmitry V. Levin ldv@altlinux.org Signed-off-by: Oded Gabbay oded.gabbay@gmail.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- include/uapi/linux/kfd_ioctl.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 26283fefdf5f..f7015aa12347 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -233,29 +233,29 @@ struct kfd_ioctl_wait_events_args { };
struct kfd_ioctl_set_scratch_backing_va_args { - uint64_t va_addr; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u64 va_addr; /* to KFD */ + __u32 gpu_id; /* to KFD */ + __u32 pad; };
struct kfd_ioctl_get_tile_config_args { /* to KFD: pointer to tile array */ - uint64_t tile_config_ptr; + __u64 tile_config_ptr; /* to KFD: pointer to macro tile array */ - uint64_t macro_tile_config_ptr; + __u64 macro_tile_config_ptr; /* to KFD: array size allocated by user mode * from KFD: array size filled by kernel */ - uint32_t num_tile_configs; + __u32 num_tile_configs; /* to KFD: array size allocated by user mode * from KFD: array size filled by kernel */ - uint32_t num_macro_tile_configs; + __u32 num_macro_tile_configs;
- uint32_t gpu_id; /* to KFD */ - uint32_t gb_addr_config; /* from KFD */ - uint32_t num_banks; /* from KFD */ - uint32_t num_ranks; /* from KFD */ + __u32 gpu_id; /* to KFD */ + __u32 gb_addr_config; /* from KFD */ + __u32 num_banks; /* from KFD */ + __u32 num_ranks; /* from KFD */ /* struct size can be extended later if needed * without breaking ABI compatibility */
From: Felix Kuehling Felix.Kuehling@amd.com
[ Upstream commit 8c946b8988acec785bcf67088b6bd0747f36d2d3 ]
SDMA only supports a fixed number of queues. HWS cannot handle oversubscription.
Signed-off-by: shaoyun liu shaoyun.liu@amd.com Signed-off-by: Felix Kuehling Felix.Kuehling@amd.com Reviewed-by: Oded Gabbay oded.gabbay@gmail.com Signed-off-by: Oded Gabbay oded.gabbay@gmail.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 03bec765b03d..f9a1a4db9be7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -184,6 +184,24 @@ int pqm_create_queue(struct process_queue_manager *pqm,
switch (type) { case KFD_QUEUE_TYPE_SDMA: + if (dev->dqm->queue_count >= + CIK_SDMA_QUEUES_PER_ENGINE * CIK_SDMA_ENGINE_NUM) { + pr_err("Over-subscription is not allowed for SDMA.\n"); + retval = -EPERM; + goto err_create_queue; + } + + retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; + pqn->kq = NULL; + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, + &q->properties.vmid); + pr_debug("DQM returned %d for create_queue\n", retval); + print_queue(q); + break; + case KFD_QUEUE_TYPE_COMPUTE: /* check if there is over subscription */ if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) &&
From: Sagi Grimberg sagi@grimberg.me
[ Upstream commit 4af7f7ff92a42b6c713293c99e7982bcfcf51a70 ]
In order to guarantee that the HCA will never get an access violation (either from invalidated rkey or from iommu) when retrying a send operation we must complete a request only when both send completion and the nvme cqe has arrived. We need to set the send/recv completions flags atomically because we might have more than a single context accessing the request concurrently (one is cq irq-poll context and the other is user-polling used in IOCB_HIPRI).
Only then we are safe to invalidate the rkey (if needed), unmap the host buffers, and complete the IO.
Signed-off-by: Sagi Grimberg sagi@grimberg.me Reviewed-by: Max Gurtovoy maxg@mellanox.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/nvme/host/rdma.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6de163e6c9eb..33d4431c2b4b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -67,6 +67,9 @@ struct nvme_rdma_request { struct nvme_request req; struct ib_mr *mr; struct nvme_rdma_qe sqe; + union nvme_result result; + __le16 status; + refcount_t ref; struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; u32 num_sge; int nents; @@ -1177,6 +1180,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, req->num_sge = 1; req->inline_data = false; req->mr->need_inval = false; + refcount_set(&req->ref, 2); /* send and recv completions */
c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -1213,8 +1217,19 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) { - if (unlikely(wc->status != IB_WC_SUCCESS)) + struct nvme_rdma_qe *qe = + container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); + struct nvme_rdma_request *req = + container_of(qe, struct nvme_rdma_request, sqe); + struct request *rq = blk_mq_rq_from_pdu(req); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { nvme_rdma_wr_error(cq, wc, "SEND"); + return; + } + + if (refcount_dec_and_test(&req->ref)) + nvme_end_request(rq, req->status, req->result); }
/* @@ -1359,14 +1374,19 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, } req = blk_mq_rq_to_pdu(rq);
- if (rq->tag == tag) - ret = 1; + req->status = cqe->status; + req->result = cqe->result;
if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) && wc->ex.invalidate_rkey == req->mr->rkey) req->mr->need_inval = false;
- nvme_end_request(rq, cqe->status, cqe->result); + if (refcount_dec_and_test(&req->ref)) { + if (rq->tag == tag) + ret = 1; + nvme_end_request(rq, req->status, req->result); + } + return ret; }
From: zhangliping zhangliping02@baidu.com
[ Upstream commit 67c8d22a73128ff910e2287567132530abcf5b71 ]
If we want to add a datapath flow, which has more than 500 vxlan outputs' action, we will get the following error reports: openvswitch: netlink: Flow action size 32832 bytes exceeds max openvswitch: netlink: Flow action size 32832 bytes exceeds max openvswitch: netlink: Actions may not be safe on all matching packets ... ...
It seems that we can simply enlarge the MAX_ACTIONS_BUFSIZE to fix it, but this is not the root cause. For example, for a vxlan output action, we need about 60 bytes for the nlattr, but after it is converted to the flow action, it only occupies 24 bytes. This means that we can still support more than 1000 vxlan output actions for a single datapath flow under the the current 32k max limitation.
So even if the nla_len(attr) is larger than MAX_ACTIONS_BUFSIZE, we shouldn't report EINVAL and keep it move on, as the judgement can be done by the reserve_sfa_size.
Signed-off-by: zhangliping zhangliping02@baidu.com Acked-by: Pravin B Shelar pshelar@ovn.org Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- net/openvswitch/flow_netlink.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index e8eb427ce6d1..0d9f6afa266c 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1903,14 +1903,11 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb)
#define MAX_ACTIONS_BUFSIZE (32 * 1024)
-static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) +static struct sw_flow_actions *nla_alloc_flow_actions(int size) { struct sw_flow_actions *sfa;
- if (size > MAX_ACTIONS_BUFSIZE) { - OVS_NLERR(log, "Flow action size %u bytes exceeds max", size); - return ERR_PTR(-EINVAL); - } + WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE);
sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); if (!sfa) @@ -1983,12 +1980,15 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = ksize(*sfa) * 2;
if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) + if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { + OVS_NLERR(log, "Flow action size exceeds max %u", + MAX_ACTIONS_BUFSIZE); return ERR_PTR(-EMSGSIZE); + } new_acts_size = MAX_ACTIONS_BUFSIZE; }
- acts = nla_alloc_flow_actions(new_acts_size, log); + acts = nla_alloc_flow_actions(new_acts_size); if (IS_ERR(acts)) return (void *)acts;
@@ -2660,7 +2660,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, { int err;
- *sfa = nla_alloc_flow_actions(nla_len(attr), log); + *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); if (IS_ERR(*sfa)) return PTR_ERR(*sfa);
From: Mirza Krak mirza.krak@endian.se
[ Upstream commit 517f56839f581618d24f2e67a35738a5c6cbaecb ]
In the case where the bind gets deferred we would end up with a un-balanced runtime PM enable call.
Fix this by simply moving the pm_runtime_enable call to the end of the bind function when all paths have succeeded.
Signed-off-by: Mirza Krak mirza.krak@endian.se Signed-off-by: Sandy Huang hjc@rock-chips.com Link: https://patchwork.freedesktop.org/patch/msgid/1510734286-37434-1-git-send-em... Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/rockchip/dw-mipi-dsi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/rockchip/dw-mipi-dsi.c b/drivers/gpu/drm/rockchip/dw-mipi-dsi.c index 9a20b9dc27c8..f7fc652b0027 100644 --- a/drivers/gpu/drm/rockchip/dw-mipi-dsi.c +++ b/drivers/gpu/drm/rockchip/dw-mipi-dsi.c @@ -1275,8 +1275,6 @@ static int dw_mipi_dsi_bind(struct device *dev, struct device *master, goto err_pllref; }
- pm_runtime_enable(dev); - dsi->dsi_host.ops = &dw_mipi_dsi_host_ops; dsi->dsi_host.dev = dev; ret = mipi_dsi_host_register(&dsi->dsi_host); @@ -1291,6 +1289,7 @@ static int dw_mipi_dsi_bind(struct device *dev, struct device *master, }
dev_set_drvdata(dev, dsi); + pm_runtime_enable(dev); return 0;
err_mipi_dsi_host:
From: Johannes Berg johannes.berg@intel.com
[ Upstream commit 7b6ddeaf27eca72795ceeae2f0f347db1b5f9a30 ]
When connected to a QoS/WMM AP, mac80211 should use a QoS NDP for probing it, instead of a regular non-QoS one, fix this.
Change all the drivers to *not* allow QoS NDP for now, even though it looks like most of them should be OK with that.
Signed-off-by: Johannes Berg johannes.berg@intel.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/net/wireless/ath/ath9k/channel.c | 2 +- drivers/net/wireless/st/cw1200/sta.c | 4 ++-- drivers/net/wireless/ti/wl1251/main.c | 2 +- drivers/net/wireless/ti/wlcore/cmd.c | 5 +++-- include/net/mac80211.h | 8 +++++++- net/mac80211/mlme.c | 2 +- net/mac80211/tx.c | 29 +++++++++++++++++++++++++++-- 7 files changed, 42 insertions(+), 10 deletions(-)
diff --git a/drivers/net/wireless/ath/ath9k/channel.c b/drivers/net/wireless/ath/ath9k/channel.c index f0439f2d566b..173891b11b2d 100644 --- a/drivers/net/wireless/ath/ath9k/channel.c +++ b/drivers/net/wireless/ath/ath9k/channel.c @@ -1112,7 +1112,7 @@ ath_chanctx_send_vif_ps_frame(struct ath_softc *sc, struct ath_vif *avp, if (!avp->assoc) return false;
- skb = ieee80211_nullfunc_get(sc->hw, vif); + skb = ieee80211_nullfunc_get(sc->hw, vif, false); if (!skb) return false;
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c index a52224836a2b..666b88cb2cfe 100644 --- a/drivers/net/wireless/st/cw1200/sta.c +++ b/drivers/net/wireless/st/cw1200/sta.c @@ -198,7 +198,7 @@ void __cw1200_cqm_bssloss_sm(struct cw1200_common *priv,
priv->bss_loss_state++;
- skb = ieee80211_nullfunc_get(priv->hw, priv->vif); + skb = ieee80211_nullfunc_get(priv->hw, priv->vif, false); WARN_ON(!skb); if (skb) cw1200_tx(priv->hw, NULL, skb); @@ -2266,7 +2266,7 @@ static int cw1200_upload_null(struct cw1200_common *priv) .rate = 0xFF, };
- frame.skb = ieee80211_nullfunc_get(priv->hw, priv->vif); + frame.skb = ieee80211_nullfunc_get(priv->hw, priv->vif, false); if (!frame.skb) return -ENOMEM;
diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c index 9915d83a4a30..6d02c660b4ab 100644 --- a/drivers/net/wireless/ti/wl1251/main.c +++ b/drivers/net/wireless/ti/wl1251/main.c @@ -566,7 +566,7 @@ static int wl1251_build_null_data(struct wl1251 *wl) size = sizeof(struct wl12xx_null_data_template); ptr = NULL; } else { - skb = ieee80211_nullfunc_get(wl->hw, wl->vif); + skb = ieee80211_nullfunc_get(wl->hw, wl->vif, false); if (!skb) goto out; size = skb->len; diff --git a/drivers/net/wireless/ti/wlcore/cmd.c b/drivers/net/wireless/ti/wlcore/cmd.c index 2bfc12fdc929..761cf8573a80 100644 --- a/drivers/net/wireless/ti/wlcore/cmd.c +++ b/drivers/net/wireless/ti/wlcore/cmd.c @@ -1069,7 +1069,8 @@ int wl12xx_cmd_build_null_data(struct wl1271 *wl, struct wl12xx_vif *wlvif) ptr = NULL; } else { skb = ieee80211_nullfunc_get(wl->hw, - wl12xx_wlvif_to_vif(wlvif)); + wl12xx_wlvif_to_vif(wlvif), + false); if (!skb) goto out; size = skb->len; @@ -1096,7 +1097,7 @@ int wl12xx_cmd_build_klv_null_data(struct wl1271 *wl, struct sk_buff *skb = NULL; int ret = -ENOMEM;
- skb = ieee80211_nullfunc_get(wl->hw, vif); + skb = ieee80211_nullfunc_get(wl->hw, vif, false); if (!skb) goto out;
diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 885690fa39c8..4f1d2dec43ce 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4470,18 +4470,24 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, * ieee80211_nullfunc_get - retrieve a nullfunc template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @qos_ok: QoS NDP is acceptable to the caller, this should be set + * if at all possible * * Creates a Nullfunc template which can, for example, uploaded to * hardware. The template must be updated after association so that correct * BSSID and address is used. * + * If @qos_ndp is set and the association is to an AP with QoS/WMM, the + * returned packet will be QoS NDP. + * * Note: Caller (or hardware) is responsible for setting the * &IEEE80211_FCTL_PM bit as well as Duration and Sequence Control fields. * * Return: The nullfunc template. %NULL on error. */ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + bool qos_ok);
/** * ieee80211_probereq_get - retrieve a Probe Request template diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3b8e2709d8de..9115cc52ce83 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -908,7 +908,7 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif); + skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, true); if (!skb) return;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 94826680cf2b..73429841f115 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4404,13 +4404,15 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, EXPORT_SYMBOL(ieee80211_pspoll_get);
struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, - struct ieee80211_vif *vif) + struct ieee80211_vif *vif, + bool qos_ok) { struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_sub_if_data *sdata; struct ieee80211_if_managed *ifmgd; struct ieee80211_local *local; struct sk_buff *skb; + bool qos = false;
if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return NULL; @@ -4419,7 +4421,17 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, ifmgd = &sdata->u.mgd; local = sdata->local;
- skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc)); + if (qos_ok) { + struct sta_info *sta; + + rcu_read_lock(); + sta = sta_info_get(sdata, ifmgd->bssid); + qos = sta && sta->sta.wme; + rcu_read_unlock(); + } + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + + sizeof(*nullfunc) + 2); if (!skb) return NULL;
@@ -4429,6 +4441,19 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_TODS); + if (qos) { + __le16 qos = cpu_to_le16(7); + + BUILD_BUG_ON((IEEE80211_STYPE_QOS_NULLFUNC | + IEEE80211_STYPE_NULLFUNC) != + IEEE80211_STYPE_QOS_NULLFUNC); + nullfunc->frame_control |= + cpu_to_le16(IEEE80211_STYPE_QOS_NULLFUNC); + skb->priority = 7; + skb_set_queue_mapping(skb, IEEE80211_AC_VO); + skb_put_data(skb, &qos, sizeof(qos)); + } + memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN); memcpy(nullfunc->addr2, vif->addr, ETH_ALEN); memcpy(nullfunc->addr3, ifmgd->bssid, ETH_ALEN);
From: Chun-Yeow Yeoh yeohchunyeow@gmail.com
[ Upstream commit fbbdad5edf0bb59786a51b94a9d006bc8c2da9a2 ]
The previous path metric update from RANN frame has not considered the own link metric toward the transmitting mesh STA. Fix this.
Reported-by: Michael65535 Signed-off-by: Chun-Yeow Yeoh yeohchunyeow@gmail.com Signed-off-by: Johannes Berg johannes.berg@intel.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- net/mac80211/mesh_hwmp.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index d8bbd0d2225a..d6d3f316de4c 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -797,7 +797,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, struct mesh_path *mpath; u8 ttl, flags, hopcount; const u8 *orig_addr; - u32 orig_sn, metric, metric_txsta, interval; + u32 orig_sn, new_metric, orig_metric, last_hop_metric, interval; bool root_is_gate;
ttl = rann->rann_ttl; @@ -808,7 +808,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, interval = le32_to_cpu(rann->rann_interval); hopcount = rann->rann_hopcount; hopcount++; - metric = le32_to_cpu(rann->rann_metric); + orig_metric = le32_to_cpu(rann->rann_metric);
/* Ignore our own RANNs */ if (ether_addr_equal(orig_addr, sdata->vif.addr)) @@ -825,7 +825,10 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, return; }
- metric_txsta = airtime_link_metric_get(local, sta); + last_hop_metric = airtime_link_metric_get(local, sta); + new_metric = orig_metric + last_hop_metric; + if (new_metric < orig_metric) + new_metric = MAX_METRIC;
mpath = mesh_path_lookup(sdata, orig_addr); if (!mpath) { @@ -838,7 +841,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, }
if (!(SN_LT(mpath->sn, orig_sn)) && - !(mpath->sn == orig_sn && metric < mpath->rann_metric)) { + !(mpath->sn == orig_sn && new_metric < mpath->rann_metric)) { rcu_read_unlock(); return; } @@ -856,7 +859,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, }
mpath->sn = orig_sn; - mpath->rann_metric = metric + metric_txsta; + mpath->rann_metric = new_metric; mpath->is_root = true; /* Recording RANNs sender address to send individually * addressed PREQs destined for root mesh STA */ @@ -876,7 +879,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, mesh_path_sel_frame_tx(MPATH_RANN, flags, orig_addr, orig_sn, 0, NULL, 0, broadcast_addr, hopcount, ttl, interval, - metric + metric_txsta, 0, sdata); + new_metric, 0, sdata); }
rcu_read_unlock();
From: Xin Long lucien.xin@gmail.com
[ Upstream commit 5c6144a0eb5366ae07fc5059301b139338f39bbd ]
As it says in rfc6525#section5.1.4, before sending the request,
C2: The sender has either no outstanding TSNs or considers all outstanding TSNs abandoned.
Prior to this patch, it tried to consider all outstanding TSNs abandoned by dropping all chunks in all outqs with sctp_outq_free (even including sacked, retransmit and transmitted queues) when doing this reset, which is too aggressive.
To make it work gently, this patch will only allow the asoc reset when the sender has no outstanding TSNs by checking if unsent, transmitted and retransmit are all empty with sctp_outq_is_empty before sending and processing the request.
Fixes: 692787cef651 ("sctp: implement receiver-side procedures for the SSN/TSN Reset Request Parameter") Signed-off-by: Xin Long lucien.xin@gmail.com Acked-by: Marcelo Ricardo Leitner marcelo.leitner@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- net/sctp/stream.c | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 724adf2786a2..7710133238ea 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -224,6 +224,9 @@ int sctp_send_reset_assoc(struct sctp_association *asoc) if (asoc->strreset_outstanding) return -EINPROGRESS;
+ if (!sctp_outq_is_empty(&asoc->outqueue)) + return -EAGAIN; + chunk = sctp_make_strreset_tsnreq(asoc); if (!chunk) return -ENOMEM; @@ -544,6 +547,12 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( } goto err; } + + if (!sctp_outq_is_empty(&asoc->outqueue)) { + result = SCTP_STRRESET_IN_PROGRESS; + goto err; + } + asoc->strreset_inseq++;
if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ))
From: Josef Bacik jbacik@fb.com
[ Upstream commit b77000ed558daa3bef0899d29bf171b8c9b5e6a8 ]
If we fail to prepare our pages for whatever reason (out of memory in our case) we need to make sure to drop the block_group->data_rwsem, otherwise hilarity ensues.
Signed-off-by: Josef Bacik jbacik@fb.com Reviewed-by: Omar Sandoval osandov@fb.com Reviewed-by: Liu Bo bo.li.liu@oracle.com Reviewed-by: David Sterba dsterba@suse.com [ add label and use existing unlocking code ] Signed-off-by: David Sterba dsterba@suse.com
Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/btrfs/free-space-cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cdc9f4015ec3..4426d1c73e50 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1264,7 +1264,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Lock all pages first so we can lock the extent safely. */ ret = io_ctl_prepare_pages(io_ctl, inode, 0); if (ret) - goto out; + goto out_unlock;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); @@ -1358,6 +1358,7 @@ out_nospc_locked: out_nospc: cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
+out_unlock: if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem);
From: Xin Long lucien.xin@gmail.com
[ Upstream commit 159f2a7456c6ae95c1e1a58e8b8ec65ef12d51cf ]
Now when doing asoc reset, it cleans up sacked and abandoned queues by calling sctp_outq_free where it also cleans up unsent, retransmit and transmitted queues.
It's safe for the sender of response, as these 3 queues are empty at that time. But when the receiver of response is doing the reset, the users may already enqueue some chunks into unsent during the time waiting the response, and these chunks should not be flushed.
To void the chunks in it would be removed, it moves the queue into a temp list, then gets it back after sctp_outq_free is done.
The patch also fixes some incorrect comments in sctp_process_strreset_tsnreq.
Signed-off-by: Xin Long lucien.xin@gmail.com Acked-by: Marcelo Ricardo Leitner marcelo.leitner@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- net/sctp/stream.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 7710133238ea..aa629654d27e 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -563,9 +563,10 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( goto out; }
- /* G3: The same processing as though a SACK chunk with no gap report - * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were - * received MUST be performed. + /* G4: The same processing as though a FWD-TSN chunk (as defined in + * [RFC3758]) with all streams affected and a new cumulative TSN + * ACK of the Receiver's Next TSN minus 1 were received MUST be + * performed. */ max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); sctp_ulpq_reasm_flushtsn(&asoc->ulpq, max_tsn_seen); @@ -580,10 +581,9 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC);
- /* G4: The same processing as though a FWD-TSN chunk (as defined in - * [RFC3758]) with all streams affected and a new cumulative TSN - * ACK of the Receiver's Next TSN minus 1 were received MUST be - * performed. + /* G3: The same processing as though a SACK chunk with no gap report + * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were + * received MUST be performed. */ sctp_outq_free(&asoc->outqueue);
@@ -844,6 +844,7 @@ struct sctp_chunk *sctp_process_strreset_resp( if (result == SCTP_STRRESET_PERFORMED) { __u32 mtsn = sctp_tsnmap_get_max_tsn_seen( &asoc->peer.tsn_map); + LIST_HEAD(temp);
sctp_ulpq_reasm_flushtsn(&asoc->ulpq, mtsn); sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); @@ -852,7 +853,13 @@ struct sctp_chunk *sctp_process_strreset_resp( SCTP_TSN_MAP_INITIAL, stsn, GFP_ATOMIC);
+ /* Clean up sacked and abandoned queues only. As the + * out_chunk_list may not be empty, splice it to temp, + * then get it back after sctp_outq_free is done. + */ + list_splice_init(&asoc->outqueue.out_chunk_list, &temp); sctp_outq_free(&asoc->outqueue); + list_splice_init(&temp, &asoc->outqueue.out_chunk_list);
asoc->next_tsn = rtsn; asoc->ctsn_ack_point = asoc->next_tsn - 1;
From: Xin Long lucien.xin@gmail.com
[ Upstream commit 52a395896a051a3d5c34fba67c324f69ec5e67c6 ]
When doing asoc reset, if the sender of the response has already sent some chunk and increased asoc->next_tsn before the duplicate request comes, the response will use the old result with an incorrect sender next_tsn.
Better than asoc->next_tsn, asoc->ctsn_ack_point can't be changed after the sender of the response has performed the asoc reset and before the peer has confirmed it, and it's value is still asoc->next_tsn original value minus 1.
This patch sets sender next_tsn for the old result with ctsn_ack_point plus 1 when processing the duplicate request, to make sure the sender next_tsn value peer gets will be always right.
Fixes: 692787cef651 ("sctp: implement receiver-side procedures for the SSN/TSN Reset Request Parameter") Signed-off-by: Xin Long lucien.xin@gmail.com Acked-by: Marcelo Ricardo Leitner marcelo.leitner@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- net/sctp/stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sctp/stream.c b/net/sctp/stream.c index aa629654d27e..9ea6057ed28b 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -541,7 +541,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) { - next_tsn = asoc->next_tsn; + next_tsn = asoc->ctsn_ack_point + 1; init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1; }
From: Jeff Layton jlayton@redhat.com
[ Upstream commit 9f97df50c52c2887432debb6238f4e43567386a5 ]
The i_version field in reiserfs is not initialized and is only ever updated here. Nothing ever views it, so just remove it.
Signed-off-by: Jeff Layton jlayton@redhat.com Signed-off-by: Jan Kara jack@suse.cz Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/reiserfs/super.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 5464ec517702..4885c7b6e44f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2591,7 +2591,6 @@ out: return err; if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); - inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); return len - towrite;
From: Wanpeng Li kernellwp@gmail.com
[ Upstream commit e70b57a6ce4e8b92a56a615ae79bdb2bd66035e7 ]
watchdog: BUG: soft lockup - CPU#6 stuck for 22s! [qemu-system-x86:10185] CPU: 6 PID: 10185 Comm: qemu-system-x86 Tainted: G OE 4.14.0-rc4+ #4 RIP: 0010:kvm_get_time_scale+0x4e/0xa0 [kvm] Call Trace: get_time_ref_counter+0x5a/0x80 [kvm] kvm_hv_process_stimers+0x120/0x5f0 [kvm] kvm_arch_vcpu_ioctl_run+0x4b4/0x1690 [kvm] kvm_vcpu_ioctl+0x33a/0x620 [kvm] do_vfs_ioctl+0xa1/0x5d0 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x1e/0xa9
This can be reproduced when running kvm-unit-tests/hyperv_stimer.flat and cpu-hotplug stress simultaneously. __this_cpu_read(cpu_tsc_khz) returns 0 (set in kvmclock_cpu_down_prep()) when the pCPU is unhotplug which results in kvm_get_time_scale() gets into an infinite loop.
This patch fixes it by treating the unhotplug pCPU as not using master clock.
Reviewed-by: Radim Krčmář rkrcmar@redhat.com Reviewed-by: David Hildenbrand david@redhat.com Cc: Paolo Bonzini pbonzini@redhat.com Cc: Radim Krčmář rkrcmar@redhat.com Signed-off-by: Wanpeng Li wanpeng.li@hotmail.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/x86.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b25326ad82a1..2dd19f234245 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1795,10 +1795,13 @@ u64 get_kvmclock_ns(struct kvm *kvm) /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu();
- kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, - &hv_clock.tsc_shift, - &hv_clock.tsc_to_system_mul); - ret = __pvclock_read_cycles(&hv_clock, rdtsc()); + if (__this_cpu_read(cpu_tsc_khz)) { + kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, + &hv_clock.tsc_shift, + &hv_clock.tsc_to_system_mul); + ret = __pvclock_read_cycles(&hv_clock, rdtsc()); + } else + ret = ktime_get_boot_ns() + ka->kvmclock_offset;
put_cpu();
From: Wanpeng Li wanpeng.li@hotmail.com
[ Upstream commit c37c28730bb031cc8a44a130c2555c0f3efbe2d0 ]
Reported by syzkaller:
*** Guest State *** CR0: actual=0x0000000080010031, shadow=0x0000000060000010, gh_mask=fffffffffffffff7 CR4: actual=0x0000000000002061, shadow=0x0000000000000000, gh_mask=ffffffffffffe8f1 CR3 = 0x000000002081e000 RSP = 0x000000000000fffa RIP = 0x0000000000000000 RFLAGS=0x00023000 DR7 = 0x00000000000000 ^^^^^^^^^^ ------------[ cut here ]------------ WARNING: CPU: 6 PID: 24431 at /home/kernel/linux/arch/x86/kvm//x86.c:7302 kvm_arch_vcpu_ioctl_run+0x651/0x2ea0 [kvm] CPU: 6 PID: 24431 Comm: reprotest Tainted: G W OE 4.14.0+ #26 RIP: 0010:kvm_arch_vcpu_ioctl_run+0x651/0x2ea0 [kvm] RSP: 0018:ffff880291d179e0 EFLAGS: 00010202 Call Trace: kvm_vcpu_ioctl+0x479/0x880 [kvm] do_vfs_ioctl+0x142/0x9a0 SyS_ioctl+0x74/0x80 entry_SYSCALL_64_fastpath+0x23/0x9a
The failed vmentry is triggered by the following beautified testcase:
#include <unistd.h> #include <sys/syscall.h> #include <string.h> #include <stdint.h> #include <linux/kvm.h> #include <fcntl.h> #include <sys/ioctl.h>
long r[5]; int main() { struct kvm_debugregs dr = { 0 };
r[2] = open("/dev/kvm", O_RDONLY); r[3] = ioctl(r[2], KVM_CREATE_VM, 0); r[4] = ioctl(r[3], KVM_CREATE_VCPU, 7); struct kvm_guest_debug debug = { .control = 0xf0403, .arch = { .debugreg[6] = 0x2, .debugreg[7] = 0x2 } }; ioctl(r[4], KVM_SET_GUEST_DEBUG, &debug); ioctl(r[4], KVM_RUN, 0); }
which testcase tries to setup the processor specific debug registers and configure vCPU for handling guest debug events through KVM_SET_GUEST_DEBUG. The KVM_SET_GUEST_DEBUG ioctl will get and set rflags in order to set TF bit if single step is needed. All regs' caches are reset to avail and GUEST_RFLAGS vmcs field is reset to 0x2 during vCPU reset. However, the cache of rflags is not reset during vCPU reset. The function vmx_get_rflags() returns an unreset rflags cache value since the cache is marked avail, it is 0 after boot. Vmentry fails if the rflags reserved bit 1 is 0.
This patch fixes it by resetting both the GUEST_RFLAGS vmcs field and its cache to 0x2 during vCPU reset.
Reported-by: Dmitry Vyukov dvyukov@google.com Tested-by: Dmitry Vyukov dvyukov@google.com Reviewed-by: David Hildenbrand david@redhat.com Cc: Paolo Bonzini pbonzini@redhat.com Cc: Radim Krčmář rkrcmar@redhat.com Cc: Nadav Amit nadav.amit@gmail.com Cc: Dmitry Vyukov dvyukov@google.com Signed-off-by: Wanpeng Li wanpeng.li@hotmail.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d073f91bc089..5242fac8165b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5608,7 +5608,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write64(GUEST_IA32_DEBUGCTL, 0); }
- vmcs_writel(GUEST_RFLAGS, 0x02); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0xfff0);
vmcs_writel(GUEST_GDTR_BASE, 0);
From: Liu Bo bo.li.liu@oracle.com
[ Upstream commit ebb70442cdd4872260c2415929c456be3562da82 ]
Xfstests btrfs/146 revealed this corruption,
[ 58.138831] Buffer I/O error on dev dm-0, logical block 2621424, async page read [ 58.151233] BTRFS error (device sdf): bdev /dev/mapper/error-test errs: wr 1, rd 0, flush 0, corrupt 0, gen 0 [ 58.152403] list_add corruption. prev->next should be next (ffff88005e6775d8), but was ffffc9000189be88. (prev=ffffc9000189be88). [ 58.153518] ------------[ cut here ]------------ [ 58.153892] WARNING: CPU: 1 PID: 1287 at lib/list_debug.c:31 __list_add_valid+0x169/0x1f0 ... [ 58.157379] RIP: 0010:__list_add_valid+0x169/0x1f0 ... [ 58.161956] Call Trace: [ 58.162264] btrfs_log_inode_parent+0x5bd/0xfb0 [btrfs] [ 58.163583] btrfs_log_dentry_safe+0x60/0x80 [btrfs] [ 58.164003] btrfs_sync_file+0x4c2/0x6f0 [btrfs] [ 58.164393] vfs_fsync_range+0x5f/0xd0 [ 58.164898] do_fsync+0x5a/0x90 [ 58.165170] SyS_fsync+0x10/0x20 [ 58.165395] entry_SYSCALL_64_fastpath+0x1f/0xbe ...
It turns out that we could record btrfs_log_ctx:io_err in log_one_extents when IO fails, but make log_one_extents() return '0' instead of -EIO, so the IO error is not acknowledged by the callers, i.e. btrfs_log_inode_parent(), which would remove btrfs_log_ctx:list from list head 'root->log_ctxs'. Since btrfs_log_ctx is allocated from stack memory, it'd get freed with a object alive on the list. then a future list_add will throw the above warning.
This returns the correct error in the above case.
Jeff also reported this while testing against his fsync error patch set[1].
[1]: https://www.spinics.net/lists/linux-btrfs/msg65308.html "btrfs list corruption and soft lockups while testing writeback error handling"
Fixes: 8407f553268a4611f254 ("Btrfs: fix data corruption after fast fsync and writeback error") Signed-off-by: Liu Bo bo.li.liu@oracle.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/btrfs/file.c | 5 +++-- fs/btrfs/tree-log.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index aafcc785f840..d564a7049d7f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2056,6 +2056,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) len = (u64)end - (u64)start + 1; trace_btrfs_sync_file(file, datasync);
+ btrfs_init_log_ctx(&ctx, inode); + /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by @@ -2202,8 +2204,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true;
- btrfs_init_log_ctx(&ctx, inode); - ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ @@ -2261,6 +2261,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_end_transaction(trans); } out: + ASSERT(list_empty(&ctx.list)); err = file_check_and_advance_wb_err(file); if (!ret) ret = err; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c800d067fcbf..d3002842d7f6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4100,7 +4100,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (ordered_io_err) { ctx->io_err = -EIO; - return 0; + return ctx->io_err; }
btrfs_init_map_token(&token);
From: Jan H. Schönherr jschoenh@amazon.de
[ Upstream commit 20b7035c66bacc909ae3ffe92c1a1ea7db99fe4f ]
KVM API says for the signal mask you set via KVM_SET_SIGNAL_MASK, that "any unblocked signal received [...] will cause KVM_RUN to return with -EINTR" and that "the signal will only be delivered if not blocked by the original signal mask".
This, however, is only true, when the calling task has a signal handler registered for a signal. If not, signal evaluation is short-circuited for SIG_IGN and SIG_DFL, and the signal is either ignored without KVM_RUN returning or the whole process is terminated.
Make KVM_SET_SIGNAL_MASK behave as advertised by utilizing logic similar to that in do_sigtimedwait() to avoid short-circuiting of signals.
Signed-off-by: Jan H. Schönherr jschoenh@amazon.de Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/mips/kvm/mips.c | 7 ++----- arch/powerpc/kvm/powerpc.c | 7 ++----- arch/s390/kvm/kvm-s390.c | 7 ++----- arch/x86/kvm/x86.c | 7 ++----- include/linux/kvm_host.h | 3 +++ virt/kvm/arm/arm.c | 8 +++----- virt/kvm/kvm_main.c | 23 +++++++++++++++++++++++ 7 files changed, 37 insertions(+), 25 deletions(-)
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index d535edc01434..75fdeaa8c62f 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -445,10 +445,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r = -EINTR; - sigset_t sigsaved;
- if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_sigset_activate(vcpu);
if (vcpu->mmio_needed) { if (!vcpu->mmio_is_write) @@ -480,8 +478,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) local_irq_enable();
out: - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu);
return r; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index ee279c7f4802..2b02d51d14d8 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1407,7 +1407,6 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r; - sigset_t sigsaved;
if (vcpu->mmio_needed) { vcpu->mmio_needed = 0; @@ -1448,16 +1447,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) #endif }
- if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_sigset_activate(vcpu);
if (run->immediate_exit) r = -EINTR; else r = kvmppc_vcpu_run(run, vcpu);
- if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu);
return r; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b87a930c2201..e6e63017bb49 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3374,7 +3374,6 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int rc; - sigset_t sigsaved;
if (kvm_run->immediate_exit) return -EINTR; @@ -3384,8 +3383,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; }
- if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_sigset_activate(vcpu);
if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) { kvm_s390_vcpu_start(vcpu); @@ -3419,8 +3417,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) disable_cpu_timer_accounting(vcpu); store_regs(vcpu, kvm_run);
- if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu);
vcpu->stat.exit_userspace++; return rc; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2dd19f234245..8c28023a43b1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7245,12 +7245,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct fpu *fpu = ¤t->thread.fpu; int r; - sigset_t sigsaved;
fpu__initialize(fpu);
- if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_sigset_activate(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { if (kvm_run->immediate_exit) { @@ -7293,8 +7291,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
out: post_kvm_run_save(vcpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu);
return r; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6882538eda32..5a8019befafd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -714,6 +714,9 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
+void kvm_sigset_activate(struct kvm_vcpu *vcpu); +void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); + void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 95cba0799828..9a07ee94a230 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -612,7 +612,6 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int ret; - sigset_t sigsaved;
if (unlikely(!kvm_vcpu_initialized(vcpu))) return -ENOEXEC; @@ -630,8 +629,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) if (run->immediate_exit) return -EINTR;
- if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_sigset_activate(vcpu);
ret = 1; run->exit_reason = KVM_EXIT_UNKNOWN; @@ -753,8 +751,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_pmu_update_run(vcpu); }
- if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu); + return ret; }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2447d7c017e7..8401774f5aeb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2073,6 +2073,29 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
+void kvm_sigset_activate(struct kvm_vcpu *vcpu) +{ + if (!vcpu->sigset_active) + return; + + /* + * This does a lockless modification of ->real_blocked, which is fine + * because, only current can change ->real_blocked and all readers of + * ->real_blocked don't care as long ->real_blocked is always a subset + * of ->blocked. + */ + sigprocmask(SIG_SETMASK, &vcpu->sigset, ¤t->real_blocked); +} + +void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) +{ + if (!vcpu->sigset_active) + return; + + sigprocmask(SIG_SETMASK, ¤t->real_blocked, NULL); + sigemptyset(¤t->real_blocked); +} + static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) { unsigned int old, val, grow;
From: "Darrick J. Wong" darrick.wong@oracle.com
[ Upstream commit 509955823cc9cc225c05673b1b83d70ca70c5c60 ]
As part of testing log recovery with dm_log_writes, Amir Goldstein discovered an error in the deferred ops recovery that lead to corruption of the filesystem metadata if a reflink+rmap filesystem happened to shut down midway through a CoW remap:
"This is what happens [after failed log recovery]:
"Phase 1 - find and verify superblock... "Phase 2 - using internal log " - zero log... " - scan filesystem freespace and inode maps... " - found root inode chunk "Phase 3 - for each AG... " - scan (but don't clear) agi unlinked lists... " - process known inodes and perform inode discovery... " - agno = 0 "data fork in regular inode 134 claims CoW block 376 "correcting nextents for inode 134 "bad data fork in inode 134 "would have cleared inode 134"
Hou Tao dissected the log contents of exactly such a crash:
"According to the implementation of xfs_defer_finish(), these ops should be completed in the following sequence:
"Have been done: "(1) CUI: Oper (160) "(2) BUI: Oper (161) "(3) CUD: Oper (194), for CUI Oper (160) "(4) RUI A: Oper (197), free rmap [0x155, 2, -9]
"Should be done: "(5) BUD: for BUI Oper (161) "(6) RUI B: add rmap [0x155, 2, 137] "(7) RUD: for RUI A "(8) RUD: for RUI B
"Actually be done by xlog_recover_process_intents() "(5) BUD: for BUI Oper (161) "(6) RUI B: add rmap [0x155, 2, 137] "(7) RUD: for RUI B "(8) RUD: for RUI A
"So the rmap entry [0x155, 2, -9] for COW should be freed firstly, then a new rmap entry [0x155, 2, 137] will be added. However, as we can see from the log record in post_mount.log (generated after umount) and the trace print, the new rmap entry [0x155, 2, 137] are added firstly, then the rmap entry [0x155, 2, -9] are freed."
When reconstructing the internal log state from the log items found on disk, it's required that deferred ops replay in exactly the same order that they would have had the filesystem not gone down. However, replaying unfinished deferred ops can create /more/ deferred ops. These new deferred ops are finished in the wrong order. This causes fs corruption and replay crashes, so let's create a single defer_ops to handle the subsequent ops created during replay, then use one single transaction at the end of log recovery to ensure that everything is replayed in the same order as they're supposed to be.
Reported-by: Amir Goldstein amir73il@gmail.com Analyzed-by: Hou Tao houtao1@huawei.com Reviewed-by: Christoph Hellwig hch@lst.de Tested-by: Amir Goldstein amir73il@gmail.com Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/xfs/xfs_bmap_item.c | 23 +++++--------- fs/xfs/xfs_bmap_item.h | 3 +- fs/xfs/xfs_log_recover.c | 75 +++++++++++++++++++++++++++++++++++++++++----- fs/xfs/xfs_refcount_item.c | 21 +++++-------- fs/xfs/xfs_refcount_item.h | 3 +- 5 files changed, 85 insertions(+), 40 deletions(-)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index dd136f7275e4..e5fb008d75e8 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -389,7 +389,8 @@ xfs_bud_init( int xfs_bui_recover( struct xfs_mount *mp, - struct xfs_bui_log_item *buip) + struct xfs_bui_log_item *buip, + struct xfs_defer_ops *dfops) { int error = 0; unsigned int bui_type; @@ -404,9 +405,7 @@ xfs_bui_recover( xfs_exntst_t state; struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_defer_ops dfops; struct xfs_bmbt_irec irec; - xfs_fsblock_t firstfsb;
ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
@@ -464,7 +463,6 @@ xfs_bui_recover(
if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY); - xfs_defer_init(&dfops, &firstfsb);
/* Process deferred bmap item. */ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? @@ -479,16 +477,16 @@ xfs_bui_recover( break; default: error = -EFSCORRUPTED; - goto err_dfops; + goto err_inode; } xfs_trans_ijoin(tp, ip, 0);
count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + error = xfs_trans_log_finish_bmap_update(tp, budp, dfops, type, ip, whichfork, bmap->me_startoff, bmap->me_startblock, &count, state); if (error) - goto err_dfops; + goto err_inode;
if (count > 0) { ASSERT(type == XFS_BMAP_UNMAP); @@ -496,16 +494,11 @@ xfs_bui_recover( irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; irec.br_state = state; - error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); + error = xfs_bmap_unmap_extent(tp->t_mountp, dfops, ip, &irec); if (error) - goto err_dfops; + goto err_inode; }
- /* Finish transaction, free inodes. */ - error = xfs_defer_finish(&tp, &dfops); - if (error) - goto err_dfops; - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -513,8 +506,6 @@ xfs_bui_recover(
return error;
-err_dfops: - xfs_defer_cancel(&dfops); err_inode: xfs_trans_cancel(tp); if (ip) { diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index c867daae4a3c..24b354a2c836 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -93,6 +93,7 @@ struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *, struct xfs_bui_log_item *); void xfs_bui_item_free(struct xfs_bui_log_item *); void xfs_bui_release(struct xfs_bui_log_item *); -int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip); +int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip, + struct xfs_defer_ops *dfops);
#endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index d6e049fdd977..eaf29646c28f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -4714,7 +4715,8 @@ STATIC int xlog_recover_process_cui( struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct xfs_defer_ops *dfops) { struct xfs_cui_log_item *cuip; int error; @@ -4727,7 +4729,7 @@ xlog_recover_process_cui( return 0;
spin_unlock(&ailp->xa_lock); - error = xfs_cui_recover(mp, cuip); + error = xfs_cui_recover(mp, cuip, dfops); spin_lock(&ailp->xa_lock);
return error; @@ -4754,7 +4756,8 @@ STATIC int xlog_recover_process_bui( struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct xfs_defer_ops *dfops) { struct xfs_bui_log_item *buip; int error; @@ -4767,7 +4770,7 @@ xlog_recover_process_bui( return 0;
spin_unlock(&ailp->xa_lock); - error = xfs_bui_recover(mp, buip); + error = xfs_bui_recover(mp, buip, dfops); spin_lock(&ailp->xa_lock);
return error; @@ -4803,6 +4806,46 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip) } }
+/* Take all the collected deferred ops and finish them in order. */ +static int +xlog_finish_defer_ops( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops) +{ + struct xfs_trans *tp; + int64_t freeblks; + uint resblks; + int error; + + /* + * We're finishing the defer_ops that accumulated as a result of + * recovering unfinished intent items during log recovery. We + * reserve an itruncate transaction because it is the largest + * permanent transaction type. Since we're the only user of the fs + * right now, take 93% (15/16) of the available free blocks. Use + * weird math to avoid a 64-bit division. + */ + freeblks = percpu_counter_sum(&mp->m_fdblocks); + if (freeblks <= 0) + return -ENOSPC; + resblks = min_t(int64_t, UINT_MAX, freeblks); + resblks = (resblks * 15) >> 4; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, + 0, XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + error = xfs_defer_finish(&tp, dfops); + if (error) + goto out_cancel; + + return xfs_trans_commit(tp); + +out_cancel: + xfs_trans_cancel(tp); + return error; +} + /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -4823,10 +4866,12 @@ STATIC int xlog_recover_process_intents( struct xlog *log) { - struct xfs_log_item *lip; - int error = 0; + struct xfs_defer_ops dfops; struct xfs_ail_cursor cur; + struct xfs_log_item *lip; struct xfs_ail *ailp; + xfs_fsblock_t firstfsb; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; #endif @@ -4837,6 +4882,7 @@ xlog_recover_process_intents( #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif + xfs_defer_init(&dfops, &firstfsb); while (lip != NULL) { /* * We're done when we see something other than an intent. @@ -4857,6 +4903,12 @@ xlog_recover_process_intents( */ ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
+ /* + * NOTE: If your intent processing routine can create more + * deferred ops, you /must/ attach them to the dfops in this + * routine or else those subsequent intents will get + * replayed in the wrong order! + */ switch (lip->li_type) { case XFS_LI_EFI: error = xlog_recover_process_efi(log->l_mp, ailp, lip); @@ -4865,10 +4917,12 @@ xlog_recover_process_intents( error = xlog_recover_process_rui(log->l_mp, ailp, lip); break; case XFS_LI_CUI: - error = xlog_recover_process_cui(log->l_mp, ailp, lip); + error = xlog_recover_process_cui(log->l_mp, ailp, lip, + &dfops); break; case XFS_LI_BUI: - error = xlog_recover_process_bui(log->l_mp, ailp, lip); + error = xlog_recover_process_bui(log->l_mp, ailp, lip, + &dfops); break; } if (error) @@ -4878,6 +4932,11 @@ xlog_recover_process_intents( out: xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); + if (error) + xfs_defer_cancel(&dfops); + else + error = xlog_finish_defer_ops(log->l_mp, &dfops); + return error; }
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 8f2e2fac4255..3a55d6fc271b 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -393,7 +393,8 @@ xfs_cud_init( int xfs_cui_recover( struct xfs_mount *mp, - struct xfs_cui_log_item *cuip) + struct xfs_cui_log_item *cuip, + struct xfs_defer_ops *dfops) { int i; int error = 0; @@ -405,11 +406,9 @@ xfs_cui_recover( struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; enum xfs_refcount_intent_type type; - xfs_fsblock_t firstfsb; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; struct xfs_bmbt_irec irec; - struct xfs_defer_ops dfops; bool requeue_only = false;
ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); @@ -465,7 +464,6 @@ xfs_cui_recover( return error; cudp = xfs_trans_get_cud(tp, cuip);
- xfs_defer_init(&dfops, &firstfsb); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { refc = &cuip->cui_format.cui_extents[i]; refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; @@ -485,7 +483,7 @@ xfs_cui_recover( new_len = refc->pe_len; } else error = xfs_trans_log_finish_refcount_update(tp, cudp, - &dfops, type, refc->pe_startblock, refc->pe_len, + dfops, type, refc->pe_startblock, refc->pe_len, &new_fsb, &new_len, &rcur); if (error) goto abort_error; @@ -497,21 +495,21 @@ xfs_cui_recover( switch (type) { case XFS_REFCOUNT_INCREASE: error = xfs_refcount_increase_extent( - tp->t_mountp, &dfops, &irec); + tp->t_mountp, dfops, &irec); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_decrease_extent( - tp->t_mountp, &dfops, &irec); + tp->t_mountp, dfops, &irec); break; case XFS_REFCOUNT_ALLOC_COW: error = xfs_refcount_alloc_cow_extent( - tp->t_mountp, &dfops, + tp->t_mountp, dfops, irec.br_startblock, irec.br_blockcount); break; case XFS_REFCOUNT_FREE_COW: error = xfs_refcount_free_cow_extent( - tp->t_mountp, &dfops, + tp->t_mountp, dfops, irec.br_startblock, irec.br_blockcount); break; @@ -525,17 +523,12 @@ xfs_cui_recover( }
xfs_refcount_finish_one_cleanup(tp, rcur, error); - error = xfs_defer_finish(&tp, &dfops); - if (error) - goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); error = xfs_trans_commit(tp); return error;
abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); -abort_defer: - xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index 5b74dddfa64b..0e5327349a13 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -96,6 +96,7 @@ struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *, struct xfs_cui_log_item *); void xfs_cui_item_free(struct xfs_cui_log_item *); void xfs_cui_release(struct xfs_cui_log_item *); -int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip); +int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip, + struct xfs_defer_ops *dfops);
#endif /* __XFS_REFCOUNT_ITEM_H__ */
From: "Darrick J. Wong" darrick.wong@oracle.com
[ Upstream commit 98c4f78dcdd8cec112d1cbc5e9a792ee6e5ab7a6 ]
In xfs_ifree, we reset the data/attr forks to extents format without bothering to free any inline data buffer that might still be around after all the blocks have been truncated off the file. Prior to commit 43518812d2 ("xfs: remove support for inlining data/extents into the inode fork") nobody noticed because the leftover inline data after truncation was small enough to fit inside the inline buffer inside the fork itself.
However, now that we've removed the inline buffer, we /always/ have to free the inline data buffer or else we leak them like crazy. This test was found by turning on kmemleak for generic/001 or generic/388.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/xfs/xfs_inode.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 63350906961a..cb4833d06467 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2422,6 +2422,24 @@ retry: }
/* + * Free any local-format buffers sitting around before we reset to + * extents format. + */ +static inline void +xfs_ifree_local_data( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return; + + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); +} + +/* * This is called to return an inode to the inode free list. * The inode should already be truncated to 0 length and have * no pages associated with it. This routine also assumes that @@ -2458,6 +2476,9 @@ xfs_ifree( if (error) return error;
+ xfs_ifree_local_data(ip, XFS_DATA_FORK); + xfs_ifree_local_data(ip, XFS_ATTR_FORK); + VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0;
From: Colin Ian King colin.king@canonical.com
[ Upstream commit 66a7c84d677e8e4a5a2ef4afdb9bd52e1399a866 ]
Currently when an error occurs devinfo is still allocated but is unused when the error exit paths break out of the for-loop. Fix this by kfree'ing devinfo to avoid the leak.
Detected by CoverityScan, CID#1416590 ("Resource Leak")
Fixes: 4124c4eba402 ("i2c: allow attaching IRQ resources to i2c_board_info") Fixes: 0daaf99d8424 ("i2c: copy device properties when using i2c_register_board_info()") Signed-off-by: Colin Ian King colin.king@canonical.com Signed-off-by: Wolfram Sang wsa@the-dreams.de Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/i2c/i2c-boardinfo.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/i2c/i2c-boardinfo.c b/drivers/i2c/i2c-boardinfo.c index 31186ead5a40..509a6007cdf6 100644 --- a/drivers/i2c/i2c-boardinfo.c +++ b/drivers/i2c/i2c-boardinfo.c @@ -86,6 +86,7 @@ int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsig property_entries_dup(info->properties); if (IS_ERR(devinfo->board_info.properties)) { status = PTR_ERR(devinfo->board_info.properties); + kfree(devinfo); break; } } @@ -98,6 +99,7 @@ int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsig GFP_KERNEL); if (!devinfo->board_info.resources) { status = -ENOMEM; + kfree(devinfo); break; } }
From: Eduardo Otubo otubo@redhat.com
[ Upstream commit 5b5971df3bc2775107ddad164018a8a8db633b81 ]
v2: * Replace busy wait with wait_event()/wake_up_all() * Cannot garantee that at the time xennet_remove is called, the xen_netback state will not be XenbusStateClosed, so added a condition for that * There's a small chance for the xen_netback state is XenbusStateUnknown by the time the xen_netfront switches to Closed, so added a condition for that.
When unloading module xen_netfront from guest, dmesg would output warning messages like below:
[ 105.236836] xen:grant_table: WARNING: g.e. 0x903 still in use! [ 105.236839] deferring g.e. 0x903 (pfn 0x35805)
This problem relies on netfront and netback being out of sync. By the time netfront revokes the g.e.'s netback didn't have enough time to free all of them, hence displaying the warnings on dmesg.
The trick here is to make netfront to wait until netback frees all the g.e.'s and only then continue to cleanup for the module removal, and this is done by manipulating both device states.
Signed-off-by: Eduardo Otubo otubo@redhat.com Acked-by: Juergen Gross jgross@suse.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/net/xen-netfront.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+)
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 8b8689c6d887..391432e2725d 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -87,6 +87,8 @@ struct netfront_cb { /* IRQ name is queue name with "-tx" or "-rx" appended */ #define IRQ_NAME_SIZE (QUEUE_NAME_SIZE + 3)
+static DECLARE_WAIT_QUEUE_HEAD(module_unload_q); + struct netfront_stats { u64 packets; u64 bytes; @@ -2021,10 +2023,12 @@ static void netback_changed(struct xenbus_device *dev, break;
case XenbusStateClosed: + wake_up_all(&module_unload_q); if (dev->state == XenbusStateClosed) break; /* Missed the backend's CLOSING state -- fallthrough */ case XenbusStateClosing: + wake_up_all(&module_unload_q); xenbus_frontend_closed(dev); break; } @@ -2130,6 +2134,20 @@ static int xennet_remove(struct xenbus_device *dev)
dev_dbg(&dev->dev, "%s\n", dev->nodename);
+ if (xenbus_read_driver_state(dev->otherend) != XenbusStateClosed) { + xenbus_switch_state(dev, XenbusStateClosing); + wait_event(module_unload_q, + xenbus_read_driver_state(dev->otherend) == + XenbusStateClosing); + + xenbus_switch_state(dev, XenbusStateClosed); + wait_event(module_unload_q, + xenbus_read_driver_state(dev->otherend) == + XenbusStateClosed || + xenbus_read_driver_state(dev->otherend) == + XenbusStateUnknown); + } + xennet_disconnect_backend(info);
unregister_netdev(info->netdev);
From: Thomas Meyer thomas@m3y3r.de
[ Upstream commit 141cbfba1d0502006463aa80f57c64086226af1a ]
This avoids the MODPOST error:
ERROR: "devm_ioremap_resource" [drivers/auxdisplay/img-ascii-lcd.ko] undefined!
Signed-off-by: Thomas Meyer thomas@m3y3r.de Acked-by: Randy Dunlap rdunlap@infradead.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/auxdisplay/Kconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig index d7d21118d3e0..2c2ed9cf8796 100644 --- a/drivers/auxdisplay/Kconfig +++ b/drivers/auxdisplay/Kconfig @@ -136,6 +136,7 @@ config CFAG12864B_RATE
config IMG_ASCII_LCD tristate "Imagination Technologies ASCII LCD Display" + depends on HAS_IOMEM default y if MIPS_MALTA || MIPS_SEAD3 select SYSCON help
From: Trond Myklebust trond.myklebust@primarydata.com
[ Upstream commit fb500a7cfee7f2f447d2bbf30cb59629feab6ac1 ]
Signed-off-by: Trond Myklebust trond.myklebust@primarydata.com Signed-off-by: J. Bruce Fields bfields@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/nfsd/nfs4state.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a439a70177a4..61f38346ce9d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -63,6 +63,9 @@ static const stateid_t zero_stateid = { static const stateid_t currentstateid = { .si_generation = 1, }; +static const stateid_t close_stateid = { + .si_generation = 0xffffffffU, +};
static u64 current_sessionid = 1;
@@ -5411,6 +5414,11 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_close_open_stateid(stp); mutex_unlock(&stp->st_mutex);
+ /* See RFC5661 sectionm 18.2.4 */ + if (stp->st_stid.sc_client->cl_minorversion) + memcpy(&close->cl_stateid, &close_stateid, + sizeof(close->cl_stateid)); + /* put reference from nfs4_preprocess_seqid_op */ nfs4_put_stid(&stp->st_stid); out:
From: Trond Myklebust trond.myklebust@primarydata.com
[ Upstream commit 659aefb68eca28ba9aa482a9fc64de107332e256 ]
In order to deal with lookup races, nfsd4_free_lock_stateid() needs to be able to signal to other stateful functions that the lock stateid is no longer valid. Right now, nfsd_lock() will check whether or not an existing stateid is still hashed, but only in the "new lock" path.
To ensure the stateid invalidation is also recognised by the "existing lock" path, and also by a second call to nfsd4_free_lock_stateid() itself, we can change the type to NFS4_CLOSED_STID under the stp->st_mutex.
Signed-off-by: Trond Myklebust trond.myklebust@primarydata.com Signed-off-by: J. Bruce Fields bfields@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/nfsd/nfs4state.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 61f38346ce9d..5ec0ca5cbc1e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5099,7 +5099,9 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) struct nfs4_ol_stateid *stp = openlockstateid(s); __be32 ret;
- mutex_lock(&stp->st_mutex); + ret = nfsd4_lock_ol_stateid(stp); + if (ret) + goto out_put_stid;
ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) @@ -5110,11 +5112,13 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) lockowner(stp->st_stateowner))) goto out;
+ stp->st_stid.sc_type = NFS4_CLOSED_STID; release_lock_stateid(stp); ret = nfs_ok;
out: mutex_unlock(&stp->st_mutex); +out_put_stid: nfs4_put_stid(s); return ret; } @@ -5683,6 +5687,8 @@ find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) lockdep_assert_held(&clp->cl_lock);
list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { + if (lst->st_stid.sc_type != NFS4_LOCK_STID) + continue; if (lst->st_stid.sc_file == fp) { atomic_inc(&lst->st_stid.sc_count); return lst; @@ -5757,7 +5763,6 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_lockowner *lo; struct nfs4_ol_stateid *lst; unsigned int strhashval; - bool hashed;
lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { @@ -5780,15 +5785,7 @@ retry: goto out; }
- mutex_lock(&lst->st_mutex); - - /* See if it's still hashed to avoid race with FREE_STATEID */ - spin_lock(&cl->cl_lock); - hashed = !list_empty(&lst->st_perfile); - spin_unlock(&cl->cl_lock); - - if (!hashed) { - mutex_unlock(&lst->st_mutex); + if (nfsd4_lock_ol_stateid(lst) != nfs_ok) { nfs4_put_stid(&lst->st_stid); goto retry; }
Chuck Lever found this caused a regression, and Trond's fix hasn't hit Linus's tree yet.
--b.
On Wed, Jan 24, 2018 at 04:14:53AM +0000, Sasha Levin wrote:
From: Trond Myklebust trond.myklebust@primarydata.com
[ Upstream commit 659aefb68eca28ba9aa482a9fc64de107332e256 ]
In order to deal with lookup races, nfsd4_free_lock_stateid() needs to be able to signal to other stateful functions that the lock stateid is no longer valid. Right now, nfsd_lock() will check whether or not an existing stateid is still hashed, but only in the "new lock" path.
To ensure the stateid invalidation is also recognised by the "existing lock" path, and also by a second call to nfsd4_free_lock_stateid() itself, we can change the type to NFS4_CLOSED_STID under the stp->st_mutex.
Signed-off-by: Trond Myklebust trond.myklebust@primarydata.com Signed-off-by: J. Bruce Fields bfields@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com
fs/nfsd/nfs4state.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 61f38346ce9d..5ec0ca5cbc1e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5099,7 +5099,9 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) struct nfs4_ol_stateid *stp = openlockstateid(s); __be32 ret;
- mutex_lock(&stp->st_mutex);
- ret = nfsd4_lock_ol_stateid(stp);
- if (ret)
goto out_put_stid;
ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) @@ -5110,11 +5112,13 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) lockowner(stp->st_stateowner))) goto out;
- stp->st_stid.sc_type = NFS4_CLOSED_STID; release_lock_stateid(stp); ret = nfs_ok;
out: mutex_unlock(&stp->st_mutex); +out_put_stid: nfs4_put_stid(s); return ret; } @@ -5683,6 +5687,8 @@ find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) lockdep_assert_held(&clp->cl_lock); list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) {
if (lst->st_stid.sc_type != NFS4_LOCK_STID)
if (lst->st_stid.sc_file == fp) { atomic_inc(&lst->st_stid.sc_count); return lst;continue;
@@ -5757,7 +5763,6 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_lockowner *lo; struct nfs4_ol_stateid *lst; unsigned int strhashval;
- bool hashed;
lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { @@ -5780,15 +5785,7 @@ retry: goto out; }
- mutex_lock(&lst->st_mutex);
- /* See if it's still hashed to avoid race with FREE_STATEID */
- spin_lock(&cl->cl_lock);
- hashed = !list_empty(&lst->st_perfile);
- spin_unlock(&cl->cl_lock);
- if (!hashed) {
mutex_unlock(&lst->st_mutex);
- if (nfsd4_lock_ol_stateid(lst) != nfs_ok) { nfs4_put_stid(&lst->st_stid); goto retry; }
-- 2.11.0
On Wed, Jan 24, 2018 at 10:47:09AM -0500, J. Bruce Fields wrote:
Chuck Lever found this caused a regression, and Trond's fix hasn't hit Linus's tree yet.
I'll remove it from all trees for now, thanks!
-- Thanks, Sasha
From: Vasily Averin vvs@virtuozzo.com
[ Upstream commit b872285751c1af010e12d02bce7069e2061a58ca ]
Signed-off-by: Vasily Averin vvs@virtuozzo.com Signed-off-by: J. Bruce Fields bfields@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/nfs_common/grace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 420d3a0ab258..1bd659938646 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -104,7 +104,9 @@ grace_exit_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id);
- BUG_ON(!list_empty(grace_list)); + WARN_ONCE(!list_empty(grace_list), + "net %x %s: grace_list is not empty\n", + net->ns.inum, __func__); }
static struct pernet_operations grace_net_ops = {
From: Trond Myklebust trond.myklebust@primarydata.com
[ Upstream commit 9271d7e509c1bfc0b9a418caec29ec8d1ac38270 ]
After taking the stateid st_mutex, we want to know that the stateid still represents valid state before performing any non-idempotent actions.
Signed-off-by: Trond Myklebust trond.myklebust@primarydata.com Signed-off-by: J. Bruce Fields bfields@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/nfsd/nfs4state.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5ec0ca5cbc1e..c927dbc8f490 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5184,15 +5184,9 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - if (stp->st_stid.sc_type == NFS4_CLOSED_STID - || stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID) - /* - * "Closed" stateid's exist *only* to return - * nfserr_replay_me from the previous step, and - * revoked delegations are kept only for free_stateid. - */ - return nfserr_bad_stateid; - mutex_lock(&stp->st_mutex); + status = nfsd4_lock_ol_stateid(stp); + if (status != nfs_ok) + return status; status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status == nfs_ok) status = nfs4_check_fh(current_fh, &stp->st_stid);
From: Vasily Averin vvs@virtuozzo.com
[ Upstream commit 6b18dd1c03e07262ea0866084856b2a3c5ba8d09 ]
lockd_inet[6]addr_event use nlmsvc_rqst without taken nlmsvc_mutex, nlmsvc_rqst can be changed during execution of notifiers and crash the host.
Patch enables access to nlmsvc_rqst only when it was correctly initialized and delays its cleanup until notifiers are no longer in use.
Note that nlmsvc_rqst can be temporally set to ERR_PTR, so the "if (nlmsvc_rqst)" check in notifiers is insufficient on its own.
Signed-off-by: Vasily Averin vvs@virtuozzo.com Tested-by: Scott Mayhew smayhew@redhat.com Signed-off-by: J. Bruce Fields bfields@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/lockd/svc.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 45e96549ebd2..809cbccbad28 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,6 +57,9 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout;
+atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); +DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); + unsigned int lockd_net_id;
/* @@ -292,7 +295,8 @@ static int lockd_inetaddr_event(struct notifier_block *this, struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin;
- if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out;
if (nlmsvc_rqst) { @@ -303,6 +307,8 @@ static int lockd_inetaddr_event(struct notifier_block *this, svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin); } + atomic_dec(&nlm_ntf_refcnt); + wake_up(&nlm_ntf_wq);
out: return NOTIFY_DONE; @@ -319,7 +325,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6;
- if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out;
if (nlmsvc_rqst) { @@ -331,6 +338,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin6); } + atomic_dec(&nlm_ntf_refcnt); + wake_up(&nlm_ntf_wq);
out: return NOTIFY_DONE; @@ -347,10 +356,12 @@ static void lockd_unregister_notifiers(void) #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif + wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0); }
static void lockd_svc_exit_thread(void) { + atomic_dec(&nlm_ntf_refcnt); lockd_unregister_notifiers(); svc_exit_thread(nlmsvc_rqst); } @@ -375,6 +386,7 @@ static int lockd_start_svc(struct svc_serv *serv) goto out_rqst; }
+ atomic_inc(&nlm_ntf_refcnt); svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections;
From: Andrew Elble aweits@rit.edu
[ Upstream commit ae254dac721d44c0bfebe2795df87459e2e88219 ]
Prevent the use of the closed (invalid) special stateid by clients.
Signed-off-by: Andrew Elble aweits@rit.edu Signed-off-by: J. Bruce Fields bfields@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/nfsd/nfs4state.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c927dbc8f490..cc74fb0dc02d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -72,6 +72,7 @@ static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) #define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) +#define CLOSE_STATEID(stateid) (!memcmp((stateid), &close_stateid, sizeof(stateid_t)))
/* forward declarations */ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); @@ -4869,7 +4870,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) struct nfs4_stid *s; __be32 status = nfserr_bad_stateid;
- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return status; /* Client debugging aid. */ if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { @@ -4927,7 +4929,8 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, else if (typemask & NFS4_DELEG_STID) typemask |= NFS4_REVOKED_DELEG_STID;
- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return nfserr_bad_stateid; status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) {
From: Vasily Averin vvs@virtuozzo.com
[ Upstream commit 81833de1a46edce9ca20cfe079872ac1c20ef359 ]
restart_grace() uses hardcoded init_net. It can cause to "list_add double add" in following scenario:
1) nfsd and lockd was started in several net namespaces 2) nfsd in init_net was stopped (lockd was not stopped because it have users from another net namespaces) 3) lockd got signal, called restart_grace() -> set_grace_period() and enabled lock_manager in hardcoded init_net. 4) nfsd in init_net is started again, its lockd_up() calls set_grace_period() and tries to add lock_manager into init_net 2nd time.
Jeff Layton suggest: "Make it safe to call locks_start_grace multiple times on the same lock_manager. If it's already on the global grace_list, then don't try to add it again. (But we don't intentionally add twice, so for now we WARN about that case.)
With this change, we also need to ensure that the nfsd4 lock manager initializes the list before we call locks_start_grace. While we're at it, move the rest of the nfsd_net initialization into nfs4_state_create_net. I see no reason to have it spread over two functions like it is today."
Suggested patch was updated to generate warning in described situation.
Suggested-by: Jeff Layton jlayton@redhat.com Signed-off-by: Vasily Averin vvs@virtuozzo.com Signed-off-by: J. Bruce Fields bfields@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/nfs_common/grace.c | 6 +++++- fs/nfsd/nfs4state.c | 7 ++++--- 2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 1bd659938646..3b13fb3b0553 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -30,7 +30,11 @@ locks_start_grace(struct net *net, struct lock_manager *lm) struct list_head *grace_list = net_generic(net, grace_net_id);
spin_lock(&grace_lock); - list_add(&lm->list, grace_list); + if (list_empty(&lm->list)) + list_add(&lm->list, grace_list); + else + WARN(1, "double list_add attempt detected in net %x %s\n", + net->ns.inum, (net == &init_net) ? "(init_net)" : ""); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index cc74fb0dc02d..58764786091e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -7018,6 +7018,10 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; + nn->boot_time = get_seconds(); + nn->grace_ended = false; + nn->nfsd4_manager.block_opens = true; + INIT_LIST_HEAD(&nn->nfsd4_manager.list); INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); @@ -7075,9 +7079,6 @@ nfs4_state_start_net(struct net *net) ret = nfs4_state_create_net(net); if (ret) return ret; - nn->boot_time = get_seconds(); - nn->grace_ended = false; - nn->nfsd4_manager.block_opens = true; locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
From: Robert Lippert roblip@gmail.com
[ Upstream commit bd467e4eababe4c04272c1e646f066db02734c79 ]
Power values in the 100s of watt range can easily blow past 32bit math limits when processing everything in microwatts.
Use 64bit math instead to avoid these issues on common 32bit ARM BMC platforms.
Fixes: 442aba78728e ("hwmon: PMBus device driver") Signed-off-by: Robert Lippert rlippert@google.com Signed-off-by: Guenter Roeck linux@roeck-us.net Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/hwmon/pmbus/pmbus_core.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c index 52a58b8b6e1b..a139940cd991 100644 --- a/drivers/hwmon/pmbus/pmbus_core.c +++ b/drivers/hwmon/pmbus/pmbus_core.c @@ -21,6 +21,7 @@
#include <linux/debugfs.h> #include <linux/kernel.h> +#include <linux/math64.h> #include <linux/module.h> #include <linux/init.h> #include <linux/err.h> @@ -499,8 +500,8 @@ static long pmbus_reg2data_linear(struct pmbus_data *data, static long pmbus_reg2data_direct(struct pmbus_data *data, struct pmbus_sensor *sensor) { - long val = (s16) sensor->data; - long m, b, R; + s64 b, val = (s16)sensor->data; + s32 m, R;
m = data->info->m[sensor->class]; b = data->info->b[sensor->class]; @@ -528,11 +529,12 @@ static long pmbus_reg2data_direct(struct pmbus_data *data, R--; } while (R < 0) { - val = DIV_ROUND_CLOSEST(val, 10); + val = div_s64(val + 5LL, 10L); /* round closest */ R++; }
- return (val - b) / m; + val = div_s64(val - b, m); + return clamp_val(val, LONG_MIN, LONG_MAX); }
/* @@ -656,7 +658,8 @@ static u16 pmbus_data2reg_linear(struct pmbus_data *data, static u16 pmbus_data2reg_direct(struct pmbus_data *data, struct pmbus_sensor *sensor, long val) { - long m, b, R; + s64 b, val64 = val; + s32 m, R;
m = data->info->m[sensor->class]; b = data->info->b[sensor->class]; @@ -673,18 +676,18 @@ static u16 pmbus_data2reg_direct(struct pmbus_data *data, R -= 3; /* Adjust R and b for data in milli-units */ b *= 1000; } - val = val * m + b; + val64 = val64 * m + b;
while (R > 0) { - val *= 10; + val64 *= 10; R--; } while (R < 0) { - val = DIV_ROUND_CLOSEST(val, 10); + val64 = div_s64(val64 + 5LL, 10L); /* round closest */ R++; }
- return val; + return (u16)clamp_val(val64, S16_MIN, S16_MAX); }
static u16 pmbus_data2reg_vid(struct pmbus_data *data,
From: Chao Yu yuchao0@huawei.com
[ Upstream commit 1a6152d36dee08da2be2a3030dceb45ef680460a ]
In commit 6184fc0b8dd7 ("quota: Propagate error from ->acquire_dquot()"), we have propagated error from __dquot_initialize to caller, but we forgot to handle such error in add_dquot_ref(), so, currently, during quota accounting information initialization flow, if we failed for some of inodes, we just ignore such error, and do account for others, which is not a good implementation.
In this patch, we choose to let user be aware of such error, so after turning on quota successfully, we can make sure all inodes disk usage can be accounted, which will be more reasonable.
Suggested-by: Jan Kara jack@suse.cz Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jan Kara jack@suse.cz Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/quota/dquot.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9f78b5015f2e..8406be97a518 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -934,12 +934,13 @@ static int dqinit_needed(struct inode *inode, int type) }
/* This routine is guarded by s_umount semaphore */ -static void add_dquot_ref(struct super_block *sb, int type) +static int add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif + int err = 0;
spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { @@ -959,7 +960,11 @@ static void add_dquot_ref(struct super_block *sb, int type) reserved = 1; #endif iput(old_inode); - __dquot_initialize(inode, type); + err = __dquot_initialize(inode, type); + if (err) { + iput(inode); + goto out; + }
/* * We hold a reference to 'inode' so it couldn't have been @@ -974,7 +979,7 @@ static void add_dquot_ref(struct super_block *sb, int type) } spin_unlock(&sb->s_inode_list_lock); iput(old_inode); - +out: #ifdef CONFIG_QUOTA_DEBUG if (reserved) { quota_error(sb, "Writes happened before quota was turned on " @@ -982,6 +987,7 @@ static void add_dquot_ref(struct super_block *sb, int type) "Please run quotacheck(8)"); } #endif + return err; }
/* @@ -2372,10 +2378,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock);
- add_dquot_ref(sb, type); - - return 0; + error = add_dquot_ref(sb, type); + if (error) + dquot_disable(sb, type, flags);
+ return error; out_file_init: dqopt->files[type] = NULL; iput(inode);
From: Antoine Tenart antoine.tenart@free-electrons.com
[ Upstream commit ba2d8d887d962c2f790e6dc01b2fd25b4608720b ]
When an allocation in the txq_init path fails, the allocated buffers end-up being freed twice: in the txq_init error path, and in txq_deinit. This lead to issues as txq_deinit would work on already freed memory regions:
kernel BUG at mm/slub.c:3915! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
This patch fixes this by removing the txq_init own error path, as the txq_deinit function is always called on errors. This was introduced by TSO as way more buffers are allocated.
Fixes: 186cd4d4e414 ("net: mvpp2: software tso support") Signed-off-by: Antoine Tenart antoine.tenart@free-electrons.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/net/ethernet/marvell/mvpp2.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-)
diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index fcf9ba5eb8d1..8044cfb0b667 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -5606,7 +5606,7 @@ static int mvpp2_txq_init(struct mvpp2_port *port, sizeof(*txq_pcpu->buffs), GFP_KERNEL); if (!txq_pcpu->buffs) - goto cleanup; + return -ENOMEM;
txq_pcpu->count = 0; txq_pcpu->reserved_num = 0; @@ -5619,26 +5619,10 @@ static int mvpp2_txq_init(struct mvpp2_port *port, &txq_pcpu->tso_headers_dma, GFP_KERNEL); if (!txq_pcpu->tso_headers) - goto cleanup; + return -ENOMEM; }
return 0; -cleanup: - for_each_present_cpu(cpu) { - txq_pcpu = per_cpu_ptr(txq->pcpu, cpu); - kfree(txq_pcpu->buffs); - - dma_free_coherent(port->dev->dev.parent, - txq_pcpu->size * TSO_HEADER_SIZE, - txq_pcpu->tso_headers, - txq_pcpu->tso_headers_dma); - } - - dma_free_coherent(port->dev->dev.parent, - txq->size * MVPP2_DESC_ALIGNED_SIZE, - txq->descs, txq->descs_dma); - - return -ENOMEM; }
/* Free allocated TXQ resources */
From: Yan Markman ymarkman@marvell.com
[ Upstream commit e749aca84b10f3987b2ee1f76e0c7d8aacc5653c ]
Short fragmented packets may never be sent by the hardware when padding is disabled. This patch stop modifying the GMAC padding bits, to leave them to their reset value (disabled).
Fixes: 3919357fb0bb ("net: mvpp2: initialize the GMAC when using a port") Signed-off-by: Yan Markman ymarkman@marvell.com [Antoine: commit message] Signed-off-by: Antoine Tenart antoine.tenart@free-electrons.com Signed-off-by: David S. Miller davem@davemloft.net
Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/net/ethernet/marvell/mvpp2.c | 9 --------- 1 file changed, 9 deletions(-)
diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index 8044cfb0b667..1dd3a1264a53 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -4552,11 +4552,6 @@ static void mvpp2_port_mii_gmac_configure_mode(struct mvpp2_port *port) MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE; val &= ~MVPP22_CTRL4_EXT_PIN_GMII_SEL; writel(val, port->base + MVPP22_GMAC_CTRL_4_REG); - - val = readl(port->base + MVPP2_GMAC_CTRL_2_REG); - val |= MVPP2_GMAC_DISABLE_PADDING; - val &= ~MVPP2_GMAC_FLOW_CTRL_MASK; - writel(val, port->base + MVPP2_GMAC_CTRL_2_REG); } else if (phy_interface_mode_is_rgmii(port->phy_interface)) { val = readl(port->base + MVPP22_GMAC_CTRL_4_REG); val |= MVPP22_CTRL4_EXT_PIN_GMII_SEL | @@ -4564,10 +4559,6 @@ static void mvpp2_port_mii_gmac_configure_mode(struct mvpp2_port *port) MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE; val &= ~MVPP22_CTRL4_DP_CLK_SEL; writel(val, port->base + MVPP22_GMAC_CTRL_4_REG); - - val = readl(port->base + MVPP2_GMAC_CTRL_2_REG); - val &= ~MVPP2_GMAC_DISABLE_PADDING; - writel(val, port->base + MVPP2_GMAC_CTRL_2_REG); }
/* The port is connected to a copper PHY */
From: Antoine Tenart antoine.tenart@free-electrons.com
[ Upstream commit 952b6b3b07877419386e719ff20917170e1ce684 ]
The Marvell 10G PHY driver supports different hardware revisions, which have their bits 3..0 differing. To get the correct revision number these bits should be ignored. This patch fixes this by using the already defined MARVELL_PHY_ID_MASK (0xfffffff0) instead of the custom 0xffffffff mask.
Fixes: 20b2af32ff3f ("net: phy: add Marvell Alaska X 88X3310 10Gigabit PHY support") Suggested-by: Yan Markman ymarkman@marvell.com Signed-off-by: Antoine Tenart antoine.tenart@free-electrons.com Reviewed-by: Andrew Lunn andrew@lunn.ch Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/net/phy/marvell10g.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index aebc08beceba..21b3f36e023a 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -16,6 +16,7 @@ * link takes priority and the other port is completely locked out. */ #include <linux/phy.h> +#include <linux/marvell_phy.h>
enum { MV_PCS_BASE_T = 0x0000, @@ -338,7 +339,7 @@ static int mv3310_read_status(struct phy_device *phydev) static struct phy_driver mv3310_drivers[] = { { .phy_id = 0x002b09aa, - .phy_id_mask = 0xffffffff, + .phy_id_mask = MARVELL_PHY_ID_MASK, .name = "mv88x3310", .features = SUPPORTED_10baseT_Full | SUPPORTED_100baseT_Full | @@ -360,7 +361,7 @@ static struct phy_driver mv3310_drivers[] = { module_phy_driver(mv3310_drivers);
static struct mdio_device_id __maybe_unused mv3310_tbl[] = { - { 0x002b09aa, 0xffffffff }, + { 0x002b09aa, MARVELL_PHY_ID_MASK }, { }, }; MODULE_DEVICE_TABLE(mdio, mv3310_tbl);
From: Christophe JAILLET christophe.jaillet@wanadoo.fr
[ Upstream commit dea521a2b9f96e905fa2bb2f95e23ec00c2ec436 ]
Error code returned by 'bnxt_read_sfp_module_eeprom_info()' is handled a few lines above when reading the A0 portion of the EEPROM. The same should be done when reading the A2 portion of the EEPROM.
In order to correctly propagate an error, update 'rc' in this 2nd call as well, otherwise 0 (success) is returned.
Signed-off-by: Christophe JAILLET christophe.jaillet@wanadoo.fr Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 3cbe771b3352..a22336fef66b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2133,8 +2133,8 @@ static int bnxt_get_module_eeprom(struct net_device *dev, /* Read A2 portion of the EEPROM */ if (length) { start -= ETH_MODULE_SFF_8436_LEN; - bnxt_read_sfp_module_eeprom_info(bp, I2C_DEV_ADDR_A2, 1, start, - length, data); + rc = bnxt_read_sfp_module_eeprom_info(bp, I2C_DEV_ADDR_A2, 1, + start, length, data); } return rc; }
From: Minwoo Im minwoo.im.dev@gmail.com
[ Upstream commit 7e5dd57ef3081ff6c03908d786ed5087f6fbb7ae ]
Following condition which will cause NULL pointer dereference will occur in nvme_free_host_mem() when it tries to remove pci device via nvme_remove() especially after a failure of host memory allocation for HMB.
"(host_mem_descs == NULL) && (nr_host_mem_descs != 0)"
It's because __nr_host_mem_descs__ is not cleared to 0 unlike __host_mem_descs__ is so.
Signed-off-by: Minwoo Im minwoo.im.dev@gmail.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5dae650438c1..cdd2fd509ddc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1617,6 +1617,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev) dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs), dev->host_mem_descs, dev->host_mem_descs_dma); dev->host_mem_descs = NULL; + dev->nr_host_mem_descs = 0; }
static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
From: Filipe Manana fdmanana@suse.com
[ Upstream commit ea37d5998b50a72b9045ba60a132eeb20e1c4230 ]
Under some circumstances, an incremental send operation can issue wrong paths for unlink commands related to files that have multiple hard links and some (or all) of those links were renamed between the parent and send snapshots. Consider the following example:
Parent snapshot
. (ino 256) |---- a/ (ino 257) | |---- b/ (ino 259) | | |---- c/ (ino 260) | | |---- f2 (ino 261) | | | |---- f2l1 (ino 261) | |---- d/ (ino 262) |---- f1l1_2 (ino 258) |---- f2l2 (ino 261) |---- f1_2 (ino 258)
Send snapshot
. (ino 256) |---- a/ (ino 257) | |---- f2l1/ (ino 263) | |---- b2/ (ino 259) | |---- c/ (ino 260) | | |---- d3 (ino 262) | | |---- f1l1_2 (ino 258) | | |---- f2l2_2 (ino 261) | | |---- f1_2 (ino 258) | | | |---- f2 (ino 261) | |---- f1l2 (ino 258) | |---- d (ino 261)
When computing the incremental send stream the following steps happen:
1) When processing inode 261, a rename operation is issued that renames inode 262, which currently as a path of "d", to an orphan name of "o262-7-0". This is done because in the send snapshot, inode 261 has of its hard links with a path of "d" as well.
2) Two link operations are issued that create the new hard links for inode 261, whose names are "d" and "f2l2_2", at paths "/" and "o262-7-0/" respectively.
3) Still while processing inode 261, unlink operations are issued to remove the old hard links of inode 261, with names "f2l1" and "f2l2", at paths "a/" and "d/". However path "d/" does not correspond anymore to the directory inode 262 but corresponds instead to a hard link of inode 261 (link command issued in the previous step). This makes the receiver fail with a ENOTDIR error when attempting the unlink operation.
The problem happens because before sending the unlink operation, we failed to detect that inode 262 was one of ancestors for inode 261 in the parent snapshot, and therefore we didn't recompute the path for inode 262 before issuing the unlink operation for the link named "f2l2" of inode 262. The detection failed because the function "is_ancestor()" only follows the first hard link it finds for an inode instead of all of its hard links (as it was originally created for being used with directories only, for which only one hard link exists). So fix this by making "is_ancestor()" follow all hard links of the input inode.
A test case for fstests follows soon.
Signed-off-by: Filipe Manana fdmanana@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/btrfs/send.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 106 insertions(+), 18 deletions(-)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8fd195cfe81b..2c35717a3470 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3527,7 +3527,40 @@ out: }
/* - * Check if ino ino1 is an ancestor of inode ino2 in the given root. + * Check if inode ino2, or any of its ancestors, is inode ino1. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int check_ino_in_path(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + const u64 ino2_gen, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + if (ino1 == ino2) + return ino1_gen == ino2_gen; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + u64 parent; + u64 parent_gen; + int ret; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) + return ret; + if (parent == ino1) + return parent_gen == ino1_gen; + ino = parent; + } + return 0; +} + +/* + * Check if ino ino1 is an ancestor of inode ino2 in the given root for any + * possible path (in case ino2 is not a directory and has multiple hard links). * Return 1 if true, 0 if false and < 0 on error. */ static int is_ancestor(struct btrfs_root *root, @@ -3536,36 +3569,91 @@ static int is_ancestor(struct btrfs_root *root, const u64 ino2, struct fs_path *fs_path) { - u64 ino = ino2; - bool free_path = false; + bool free_fs_path = false; int ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key;
if (!fs_path) { fs_path = fs_path_alloc(); if (!fs_path) return -ENOMEM; - free_path = true; + free_fs_path = true; }
- while (ino > BTRFS_FIRST_FREE_OBJECTID) { - u64 parent; - u64 parent_gen; + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + }
- fs_path_reset(fs_path); - ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); - if (ret < 0) { - if (ret == -ENOENT && ino == ino2) - ret = 0; - goto out; + key.objectid = ino2; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u32 cur_offset = 0; + u32 item_size; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + continue; } - if (parent == ino1) { - ret = parent_gen == ino1_gen ? 1 : 0; - goto out; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino2) + break; + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + break; + + item_size = btrfs_item_size_nr(leaf, slot); + while (cur_offset < item_size) { + u64 parent; + u64 parent_gen; + + if (key.type == BTRFS_INODE_EXTREF_KEY) { + unsigned long ptr; + struct btrfs_inode_extref *extref; + + ptr = btrfs_item_ptr_offset(leaf, slot); + extref = (struct btrfs_inode_extref *) + (ptr + cur_offset); + parent = btrfs_inode_extref_parent(leaf, + extref); + cur_offset += sizeof(*extref); + cur_offset += btrfs_inode_extref_name_len(leaf, + extref); + } else { + parent = key.offset; + cur_offset = item_size; + } + + ret = get_inode_info(root, parent, NULL, &parent_gen, + NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = check_ino_in_path(root, ino1, ino1_gen, + parent, parent_gen, fs_path); + if (ret) + goto out; } - ino = parent; + path->slots[0]++; } + ret = 0; out: - if (free_path) + btrfs_free_path(path); + if (free_fs_path) fs_path_free(fs_path); return ret; }
From: Michal Hocko mhocko@suse.com
[ Upstream commit d210a9874b8f6166579408131cb74495caff1958 ]
percpu_counter_init failure path doesn't clean up &btp->bt_lru list. Call list_lru_destroy in that error path. Similarly register_shrinker error path is not handled.
While it is unlikely to trigger these error path, it is not impossible especially the later might fail with large NUMAs. Let's handle the failure to make the code more robust.
Noticed-by: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Michal Hocko mhocko@suse.com Acked-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong darrick.wong@oracle.com Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/xfs/xfs_buf.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 2f97c12ca75e..16f93d7356b7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1813,22 +1813,27 @@ xfs_alloc_buftarg( btp->bt_daxdev = dax_dev;
if (xfs_setsize_buftarg_early(btp, bdev)) - goto error; + goto error_free;
if (list_lru_init(&btp->bt_lru)) - goto error; + goto error_free;
if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) - goto error; + goto error_lru;
btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - register_shrinker(&btp->bt_shrinker); + if (register_shrinker(&btp->bt_shrinker)) + goto error_pcpu; return btp;
-error: +error_pcpu: + percpu_counter_destroy(&btp->bt_io_count); +error_lru: + list_lru_destroy(&btp->bt_lru); +error_free: kmem_free(btp); return NULL; }
From: Christian König christian.koenig@amd.com
[ Upstream commit 6edc6910ba4cd6eab309263539c8f09b8ad772bf ]
Never try to move pinned BOs during CS.
Signed-off-by: Christian König christian.koenig@amd.com Reviewed-by: Michel Dänzer michel.daenzer@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 60d8bedb694d..b5aa8e6f8e0b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -403,6 +403,10 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p, if (candidate->robj == validated) break;
+ /* We can't move pinned BOs here */ + if (bo->pin_count) + continue; + other = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type);
/* Check if this BO is in one of the domains we need space for */
From: Geert Uytterhoeven geert+renesas@glider.be
[ Upstream commit 15bfe05c8d6386f1a90e9340d15336e85e32aad6 ]
On 64-bit (e.g. powerpc64/allmodconfig):
drivers/net/ethernet/xilinx/ll_temac_main.c: In function 'temac_start_xmit_done': drivers/net/ethernet/xilinx/ll_temac_main.c:633:22: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] dev_kfree_skb_irq((struct sk_buff *)cur_p->app4); ^
cdmac_bd.app4 is u32, so it is too small to hold a kernel pointer.
Note that several other fields in struct cdmac_bd are also too small to hold physical addresses on 64-bit platforms.
Signed-off-by: Geert Uytterhoeven geert+renesas@glider.be Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/net/ethernet/xilinx/Kconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/net/ethernet/xilinx/Kconfig b/drivers/net/ethernet/xilinx/Kconfig index 6d68c8a8f4f2..da4ec575ccf9 100644 --- a/drivers/net/ethernet/xilinx/Kconfig +++ b/drivers/net/ethernet/xilinx/Kconfig @@ -34,6 +34,7 @@ config XILINX_AXI_EMAC config XILINX_LL_TEMAC tristate "Xilinx LL TEMAC (LocalLink Tri-mode Ethernet MAC) driver" depends on (PPC || MICROBLAZE) + depends on !64BIT || BROKEN select PHYLIB ---help--- This driver supports the Xilinx 10/100/1000 LocalLink TEMAC
From: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp
[ Upstream commit 88bc0ede8d35edc969350852894dc864a2dc1859 ]
register_shrinker() might return -ENOMEM error since Linux 3.12. Call panic() as with other failure checks in this function if register_shrinker() failed.
Fixes: 1d3d4437eae1 ("vmscan: per-node deferred work") Signed-off-by: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp Cc: Jan Kara jack@suse.com Cc: Michal Hocko mhocko@suse.com Reviewed-by: Michal Hocko mhocko@suse.com Signed-off-by: Jan Kara jack@suse.cz Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/quota/dquot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8406be97a518..4cd0c2336624 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2985,7 +2985,8 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
- register_shrinker(&dqcache_shrinker); + if (register_shrinker(&dqcache_shrinker)) + panic("Cannot register dquot shrinker");
return 0; }
From: Trond Myklebust trond.myklebust@primarydata.com
[ Upstream commit 4ba161a793d5f43757c35feff258d9f20a082940 ]
Reported-by: Dmitry Vyukov dvyukov@google.com Signed-off-by: Trond Myklebust trond.myklebust@primarydata.com Tested-by: Dmitry Vyukov dvyukov@google.com Signed-off-by: Anna Schumaker Anna.Schumaker@Netapp.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4dad5da388d6..8cb40f8ffa5b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2437,6 +2437,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -ECONNREFUSED: case -ECONNRESET: case -ENETUNREACH: + case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: /*
From: "Liu, Changcheng" changcheng.liu@intel.com
[ Upstream commit 95a87982541932503d3f59aba4c30b0bde0a6294 ]
When cross-compiling, fadd2line should use the binary tool used for the target system, rather than that of the host.
Link: http://lkml.kernel.org/r/20171121092911.GA150711@sofia Signed-off-by: Liu Changcheng changcheng.liu@intel.com Cc: Kate Stewart kstewart@linuxfoundation.org Cc: NeilBrown neilb@suse.com Cc: Thomas Gleixner tglx@linutronix.de Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- scripts/faddr2line | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/scripts/faddr2line b/scripts/faddr2line index 1f5ce959f596..39e07d8574dd 100755 --- a/scripts/faddr2line +++ b/scripts/faddr2line @@ -44,9 +44,16 @@ set -o errexit set -o nounset
+READELF="${CROSS_COMPILE}readelf" +ADDR2LINE="${CROSS_COMPILE}addr2line" +SIZE="${CROSS_COMPILE}size" +NM="${CROSS_COMPILE}nm" + command -v awk >/dev/null 2>&1 || die "awk isn't installed" -command -v readelf >/dev/null 2>&1 || die "readelf isn't installed" -command -v addr2line >/dev/null 2>&1 || die "addr2line isn't installed" +command -v ${READELF} >/dev/null 2>&1 || die "readelf isn't installed" +command -v ${ADDR2LINE} >/dev/null 2>&1 || die "addr2line isn't installed" +command -v ${SIZE} >/dev/null 2>&1 || die "size isn't installed" +command -v ${NM} >/dev/null 2>&1 || die "nm isn't installed"
usage() { echo "usage: faddr2line <object file> <func+offset> <func+offset>..." >&2 @@ -69,10 +76,10 @@ die() { find_dir_prefix() { local objfile=$1
- local start_kernel_addr=$(readelf -sW $objfile | awk '$8 == "start_kernel" {printf "0x%s", $2}') + local start_kernel_addr=$(${READELF} -sW $objfile | awk '$8 == "start_kernel" {printf "0x%s", $2}') [[ -z $start_kernel_addr ]] && return
- local file_line=$(addr2line -e $objfile $start_kernel_addr) + local file_line=$(${ADDR2LINE} -e $objfile $start_kernel_addr) [[ -z $file_line ]] && return
local prefix=${file_line%init/main.c:*} @@ -104,7 +111,7 @@ __faddr2line() {
# Go through each of the object's symbols which match the func name. # In rare cases there might be duplicates. - file_end=$(size -Ax $objfile | awk '$1 == ".text" {print $2}') + file_end=$(${SIZE} -Ax $objfile | awk '$1 == ".text" {print $2}') while read symbol; do local fields=($symbol) local sym_base=0x${fields[0]} @@ -156,10 +163,10 @@ __faddr2line() {
# pass real address to addr2line echo "$func+$offset/$sym_size:" - addr2line -fpie $objfile $addr | sed "s; $dir_prefix(./)*; ;" + ${ADDR2LINE} -fpie $objfile $addr | sed "s; $dir_prefix(./)*; ;" DONE=1
- done < <(nm -n $objfile | awk -v fn=$func -v end=$file_end '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, "0x"$1 } END {if (found == 1) print line, end; }') + done < <(${NM} -n $objfile | awk -v fn=$func -v end=$file_end '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, "0x"$1 } END {if (found == 1) print line, end; }') }
[[ $# -lt 2 ]] && usage
From: Jiang Biao jiang.biao2@zte.com.cn
[ Upstream commit d5dabd633922ac5ee5bcc67748f7defb8b211469 ]
When running ltp stress test for 7*24 hours, vmscan occasionally emits the following warning continuously:
mb_cache_scan+0x0/0x3f0 negative objects to delete nr=-9232265467809300450 ...
Tracing shows the freeable(mb_cache_count returns) is -1, which causes the continuous accumulation and overflow of total_scan.
This patch makes sure that mb_cache_count() cannot return a negative value, which makes the mbcache shrinker more robust.
Link: http://lkml.kernel.org/r/1511753419-52328-1-git-send-email-jiang.biao2@zte.c... Signed-off-by: Jiang Biao jiang.biao2@zte.com.cn Cc: Al Viro viro@zeniv.linux.org.uk Cc: Minchan Kim minchan@kernel.org Cc: Michal Hocko mhocko@kernel.org Cc: zhong.weidong@zte.com.cn Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/mbcache.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/mbcache.c b/fs/mbcache.c index d818fd236787..b8b8b9ced9f8 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -269,6 +269,9 @@ static unsigned long mb_cache_count(struct shrinker *shrink, struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink);
+ /* Unlikely, but not impossible */ + if (unlikely(cache->c_entry_count < 0)) + return 0; return cache->c_entry_count; }
From: Yisheng Xie xieyisheng1@huawei.com
[ Upstream commit bde5f6bc68db51128f875a756e9082a6c6ff7b4c ]
kmemleak_scan() will scan struct page for each node and it can be really large and resulting in a soft lockup. We have seen a soft lockup when do scan while compile kernel:
watchdog: BUG: soft lockup - CPU#53 stuck for 22s! [bash:10287] [...] Call Trace: kmemleak_scan+0x21a/0x4c0 kmemleak_write+0x312/0x350 full_proxy_write+0x5a/0xa0 __vfs_write+0x33/0x150 vfs_write+0xad/0x1a0 SyS_write+0x52/0xc0 do_syscall_64+0x61/0x1a0 entry_SYSCALL64_slow_path+0x25/0x25
Fix this by adding cond_resched every MAX_SCAN_SIZE.
Link: http://lkml.kernel.org/r/1511439788-20099-1-git-send-email-xieyisheng1@huawe... Signed-off-by: Yisheng Xie xieyisheng1@huawei.com Suggested-by: Catalin Marinas catalin.marinas@arm.com Acked-by: Catalin Marinas catalin.marinas@arm.com Cc: Michal Hocko mhocko@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- mm/kmemleak.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 7780cd83a495..a1ba553816eb 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1532,6 +1532,8 @@ static void kmemleak_scan(void) if (page_count(page) == 0) continue; scan_block(page, page + 1, NULL); + if (!(pfn % (MAX_SCAN_SIZE / sizeof(*page)))) + cond_resched(); } } put_online_mems();
Hi Sasha,
There is a fix patch for this patch:
13ab183 mm/kmemleak.c: make cond_resched() rate-limiting more efficient
So if this patch is backported to stable, maybe the above patch is also need to be backported too.
Thanks Yisheng
On 2018/1/24 12:15, Sasha Levin wrote:
From: Yisheng Xie xieyisheng1@huawei.com
[ Upstream commit bde5f6bc68db51128f875a756e9082a6c6ff7b4c ]
kmemleak_scan() will scan struct page for each node and it can be really large and resulting in a soft lockup. We have seen a soft lockup when do scan while compile kernel:
watchdog: BUG: soft lockup - CPU#53 stuck for 22s! [bash:10287] [...] Call Trace: kmemleak_scan+0x21a/0x4c0 kmemleak_write+0x312/0x350 full_proxy_write+0x5a/0xa0 __vfs_write+0x33/0x150 vfs_write+0xad/0x1a0 SyS_write+0x52/0xc0 do_syscall_64+0x61/0x1a0 entry_SYSCALL64_slow_path+0x25/0x25
Fix this by adding cond_resched every MAX_SCAN_SIZE.
Link: http://lkml.kernel.org/r/1511439788-20099-1-git-send-email-xieyisheng1@huawe... Signed-off-by: Yisheng Xie xieyisheng1@huawei.com Suggested-by: Catalin Marinas catalin.marinas@arm.com Acked-by: Catalin Marinas catalin.marinas@arm.com Cc: Michal Hocko mhocko@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin alexander.levin@microsoft.com
mm/kmemleak.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 7780cd83a495..a1ba553816eb 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1532,6 +1532,8 @@ static void kmemleak_scan(void) if (page_count(page) == 0) continue; scan_block(page, page + 1, NULL);
if (!(pfn % (MAX_SCAN_SIZE / sizeof(*page))))
} } put_online_mems();cond_resched();
On Wed, Jan 24, 2018 at 03:46:27PM +0800, Yisheng Xie wrote:
Hi Sasha,
There is a fix patch for this patch:
13ab183 mm/kmemleak.c: make cond_resched() rate-limiting more efficient
So if this patch is backported to stable, maybe the above patch is also need to be backported too.
Hi Yisheng,
This patch fixes a softlockup, while the patch you linked seems like an performance optimization for this patch, so I'd rather skip it unless you're aware of an actual issue it might fix.
-- Thanks, Sasha
From: Eric Anholt eric@anholt.net
[ Upstream commit dbb58bfd9ae6c885b2ca001a9a5ab8b881fb4ba9 ]
The panel_bridge bridge attaches to the panel's OF node, not the lvds-encoder's node. Put in a little no-op bridge of our own so that our consumers can still find a bridge where they expect.
This also fixes an unintended unregistration and leak of the panel-bridge on module remove.
Signed-off-by: Eric Anholt eric@anholt.net Fixes: 13dfc0540a57 ("drm/bridge: Refactor out the panel wrapper from the lvds-encoder bri dge.") Tested-by: Lothar Waßmann LW@KARO-electronics.de Signed-off-by: Archit Taneja architt@codeaurora.org Link: https://patchwork.freedesktop.org/patch/msgid/20171114191647.22207-1-eric@an...
Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/bridge/lvds-encoder.c | 48 ++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/bridge/lvds-encoder.c b/drivers/gpu/drm/bridge/lvds-encoder.c index 0903ba574f61..75b0d3f6e4de 100644 --- a/drivers/gpu/drm/bridge/lvds-encoder.c +++ b/drivers/gpu/drm/bridge/lvds-encoder.c @@ -13,13 +13,37 @@
#include <linux/of_graph.h>
+struct lvds_encoder { + struct drm_bridge bridge; + struct drm_bridge *panel_bridge; +}; + +static int lvds_encoder_attach(struct drm_bridge *bridge) +{ + struct lvds_encoder *lvds_encoder = container_of(bridge, + struct lvds_encoder, + bridge); + + return drm_bridge_attach(bridge->encoder, lvds_encoder->panel_bridge, + bridge); +} + +static struct drm_bridge_funcs funcs = { + .attach = lvds_encoder_attach, +}; + static int lvds_encoder_probe(struct platform_device *pdev) { struct device_node *port; struct device_node *endpoint; struct device_node *panel_node; struct drm_panel *panel; - struct drm_bridge *bridge; + struct lvds_encoder *lvds_encoder; + + lvds_encoder = devm_kzalloc(&pdev->dev, sizeof(*lvds_encoder), + GFP_KERNEL); + if (!lvds_encoder) + return -ENOMEM;
/* Locate the panel DT node. */ port = of_graph_get_port_by_id(pdev->dev.of_node, 1); @@ -49,20 +73,30 @@ static int lvds_encoder_probe(struct platform_device *pdev) return -EPROBE_DEFER; }
- bridge = drm_panel_bridge_add(panel, DRM_MODE_CONNECTOR_LVDS); - if (IS_ERR(bridge)) - return PTR_ERR(bridge); + lvds_encoder->panel_bridge = + devm_drm_panel_bridge_add(&pdev->dev, + panel, DRM_MODE_CONNECTOR_LVDS); + if (IS_ERR(lvds_encoder->panel_bridge)) + return PTR_ERR(lvds_encoder->panel_bridge); + + /* The panel_bridge bridge is attached to the panel's of_node, + * but we need a bridge attached to our of_node for our user + * to look up. + */ + lvds_encoder->bridge.of_node = pdev->dev.of_node; + lvds_encoder->bridge.funcs = &funcs; + drm_bridge_add(&lvds_encoder->bridge);
- platform_set_drvdata(pdev, bridge); + platform_set_drvdata(pdev, lvds_encoder);
return 0; }
static int lvds_encoder_remove(struct platform_device *pdev) { - struct drm_bridge *bridge = platform_get_drvdata(pdev); + struct lvds_encoder *lvds_encoder = platform_get_drvdata(pdev);
- drm_bridge_remove(bridge); + drm_bridge_remove(&lvds_encoder->bridge);
return 0; }
From: Andrey Gusakov andrey.gusakov@cogentembedded.com
[ Upstream commit cffd2b16c01c3431a7a7dd62e722af33490fc436 ]
Do not fail data rates higher than 2.7 and more than 2 lanes. Try to fall back to 2.7Gbps and 2 lanes.
Acked-by: Philipp Zabel p.zabel@pengutronix.de Reviewed-by: Andrzej Hajda a.hajda@samsung.com Signed-off-by: Andrey Gusakov andrey.gusakov@cogentembedded.com Signed-off-by: Andrzej Hajda a.hajda@samsung.com Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-2-git-send-em... Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/bridge/tc358767.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index 8571cfd877c5..e1996dbfb0a1 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -603,8 +603,15 @@ static int tc_get_display_props(struct tc_data *tc) ret = drm_dp_link_probe(&tc->aux, &tc->link.base); if (ret < 0) goto err_dpcd_read; - if ((tc->link.base.rate != 162000) && (tc->link.base.rate != 270000)) - goto err_dpcd_inval; + if (tc->link.base.rate != 162000 && tc->link.base.rate != 270000) { + dev_dbg(tc->dev, "Falling to 2.7 Gbps rate\n"); + tc->link.base.rate = 270000; + } + + if (tc->link.base.num_lanes > 2) { + dev_dbg(tc->dev, "Falling to 2 lanes\n"); + tc->link.base.num_lanes = 2; + }
ret = drm_dp_dpcd_readb(&tc->aux, DP_MAX_DOWNSPREAD, tmp); if (ret < 0) @@ -637,9 +644,6 @@ static int tc_get_display_props(struct tc_data *tc) err_dpcd_read: dev_err(tc->dev, "failed to read DPCD: %d\n", ret); return ret; -err_dpcd_inval: - dev_err(tc->dev, "invalid DPCD\n"); - return -EINVAL; }
static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode)
From: Andrey Gusakov andrey.gusakov@cogentembedded.com
[ Upstream commit f3b8adbe1911f66fd3cab1aaa74f0f66b7ceda25 ]
Remove shift from TU_SIZE_RECOMMENDED define as it used to calculate max_tu_symbols.
Acked-by: Philipp Zabel p.zabel@pengutronix.de Signed-off-by: Andrey Gusakov andrey.gusakov@cogentembedded.com Signed-off-by: Andrzej Hajda a.hajda@samsung.com Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-4-git-send-em... Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/bridge/tc358767.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index e571b5e29ae7..7334cb2121a3 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -97,7 +97,7 @@ #define DP0_ACTIVEVAL 0x0650 #define DP0_SYNCVAL 0x0654 #define DP0_MISC 0x0658 -#define TU_SIZE_RECOMMENDED (0x3f << 16) /* LSCLK cycles per TU */ +#define TU_SIZE_RECOMMENDED (63) /* LSCLK cycles per TU */ #define BPC_6 (0 << 5) #define BPC_8 (1 << 5)
@@ -716,7 +716,8 @@ static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode) * Must be less than tu_size. */ max_tu_symbol = TU_SIZE_RECOMMENDED - 1; - tc_write(DP0_MISC, (max_tu_symbol << 23) | TU_SIZE_RECOMMENDED | BPC_8); + tc_write(DP0_MISC, (max_tu_symbol << 23) | (TU_SIZE_RECOMMENDED << 16) | + BPC_8);
return 0; err:
From: Andrey Gusakov andrey.gusakov@cogentembedded.com
[ Upstream commit 99fc8e963a4c0203dba26a77cf737db6081bca14 ]
Pixel clock limitation for DPI is 154 MHz. Do not accept modes with higher pixel clock rate.
Reviewed-by: Andrzej Hajda a.hajda@samsung.com Signed-off-by: Andrey Gusakov andrey.gusakov@cogentembedded.com Signed-off-by: Andrzej Hajda a.hajda@samsung.com Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-3-git-send-em... Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/bridge/tc358767.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index e1996dbfb0a1..e571b5e29ae7 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -1103,7 +1103,10 @@ static bool tc_bridge_mode_fixup(struct drm_bridge *bridge, static int tc_connector_mode_valid(struct drm_connector *connector, struct drm_display_mode *mode) { - /* Accept any mode */ + /* DPI interface clock limitation: upto 154 MHz */ + if (mode->clock > 154000) + return MODE_CLOCK_HIGH; + return MODE_OK; }
From: Andrey Gusakov andrey.gusakov@cogentembedded.com
[ Upstream commit 9217c1abbc145a77d65c476cf2004a3df02104c7 ]
First four bytes should go to DP0_AUXWDATA0. Due to bug if len > 4 first four bytes was writen to DP0_AUXWDATA1 and all data get shifted by 4 bytes. Fix it.
Acked-by: Philipp Zabel p.zabel@pengutronix.de Signed-off-by: Andrey Gusakov andrey.gusakov@cogentembedded.com Signed-off-by: Andrzej Hajda a.hajda@samsung.com Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-6-git-send-em... Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/bridge/tc358767.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index b916346b933a..24f495bf0567 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -318,7 +318,7 @@ static ssize_t tc_aux_transfer(struct drm_dp_aux *aux, tmp = (tmp << 8) | buf[i]; i++; if (((i % 4) == 0) || (i == size)) { - tc_write(DP0_AUXWDATA(i >> 2), tmp); + tc_write(DP0_AUXWDATA((i - 1) >> 2), tmp); tmp = 0; } }
From: Andrey Gusakov andrey.gusakov@cogentembedded.com
[ Upstream commit 66d1c3b94d5d59e4325e61a78d520f92c043d645 ]
Fields in HTIM01 and HTIM02 regs should be even. Recomended thresh_dly value is max_tu_symbol. Remove set of VPCTRL0.VSDELAY as it is related to DSI input interface. Currently driver supports only DPI.
Acked-by: Philipp Zabel p.zabel@pengutronix.de Signed-off-by: Andrey Gusakov andrey.gusakov@cogentembedded.com Signed-off-by: Andrzej Hajda a.hajda@samsung.com Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-5-git-send-em... Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/bridge/tc358767.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index 7334cb2121a3..b916346b933a 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -659,6 +659,14 @@ static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode) int lower_margin = mode->vsync_start - mode->vdisplay; int vsync_len = mode->vsync_end - mode->vsync_start;
+ /* + * Recommended maximum number of symbols transferred in a transfer unit: + * DIV_ROUND_UP((input active video bandwidth in bytes) * tu_size, + * (output active video bandwidth in bytes)) + * Must be less than tu_size. + */ + max_tu_symbol = TU_SIZE_RECOMMENDED - 1; + dev_dbg(tc->dev, "set mode %dx%d\n", mode->hdisplay, mode->vdisplay); dev_dbg(tc->dev, "H margin %d,%d sync %d\n", @@ -668,13 +676,18 @@ static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode) dev_dbg(tc->dev, "total: %dx%d\n", mode->htotal, mode->vtotal);
- /* LCD Ctl Frame Size */ - tc_write(VPCTRL0, (0x40 << 20) /* VSDELAY */ | + /* + * LCD Ctl Frame Size + * datasheet is not clear of vsdelay in case of DPI + * assume we do not need any delay when DPI is a source of + * sync signals + */ + tc_write(VPCTRL0, (0 << 20) /* VSDELAY */ | OPXLFMT_RGB888 | FRMSYNC_DISABLED | MSF_DISABLED); - tc_write(HTIM01, (left_margin << 16) | /* H back porch */ - (hsync_len << 0)); /* Hsync */ - tc_write(HTIM02, (right_margin << 16) | /* H front porch */ - (mode->hdisplay << 0)); /* width */ + tc_write(HTIM01, (ALIGN(left_margin, 2) << 16) | /* H back porch */ + (ALIGN(hsync_len, 2) << 0)); /* Hsync */ + tc_write(HTIM02, (ALIGN(right_margin, 2) << 16) | /* H front porch */ + (ALIGN(mode->hdisplay, 2) << 0)); /* width */ tc_write(VTIM01, (upper_margin << 16) | /* V back porch */ (vsync_len << 0)); /* Vsync */ tc_write(VTIM02, (lower_margin << 16) | /* V front porch */ @@ -693,7 +706,7 @@ static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode) /* DP Main Stream Attributes */ vid_sync_dly = hsync_len + left_margin + mode->hdisplay; tc_write(DP0_VIDSYNCDELAY, - (0x003e << 16) | /* thresh_dly */ + (max_tu_symbol << 16) | /* thresh_dly */ (vid_sync_dly << 0));
tc_write(DP0_TOTALVAL, (mode->vtotal << 16) | (mode->htotal)); @@ -709,13 +722,6 @@ static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode) tc_write(DPIPXLFMT, VS_POL_ACTIVE_LOW | HS_POL_ACTIVE_LOW | DE_POL_ACTIVE_HIGH | SUB_CFG_TYPE_CONFIG1 | DPI_BPP_RGB888);
- /* - * Recommended maximum number of symbols transferred in a transfer unit: - * DIV_ROUND_UP((input active video bandwidth in bytes) * tu_size, - * (output active video bandwidth in bytes)) - * Must be less than tu_size. - */ - max_tu_symbol = TU_SIZE_RECOMMENDED - 1; tc_write(DP0_MISC, (max_tu_symbol << 23) | (TU_SIZE_RECOMMENDED << 16) | BPC_8);
From: Andrey Gusakov andrey.gusakov@cogentembedded.com
[ Upstream commit 4dbd6c03fbf88299c573d676838896c6e06aade2 ]
Use drm_dp_channel_eq_ok helper
Acked-by: Philipp Zabel p.zabel@pengutronix.de Signed-off-by: Andrey Gusakov andrey.gusakov@cogentembedded.com Signed-off-by: Andrzej Hajda a.hajda@samsung.com Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-7-git-send-em... Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/bridge/tc358767.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index 24f495bf0567..8636e7eeb731 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -819,8 +819,6 @@ static int tc_main_link_setup(struct tc_data *tc) unsigned int rate; u32 dp_phy_ctrl; int timeout; - bool aligned; - bool ready; u32 value; int ret; u8 tmp[8]; @@ -965,16 +963,15 @@ static int tc_main_link_setup(struct tc_data *tc) ret = drm_dp_dpcd_read_link_status(aux, tmp + 2); if (ret < 0) goto err_dpcd_read; - ready = (tmp[2] == ((DP_CHANNEL_EQ_BITS << 4) | /* Lane1 */ - DP_CHANNEL_EQ_BITS)); /* Lane0 */ - aligned = tmp[4] & DP_INTERLANE_ALIGN_DONE; - } while ((--timeout) && !(ready && aligned)); + } while ((--timeout) && + !(drm_dp_channel_eq_ok(tmp + 2, tc->link.base.num_lanes)));
if (timeout == 0) { /* Read DPCD 0x200-0x201 */ ret = drm_dp_dpcd_read(aux, DP_SINK_COUNT, tmp, 2); if (ret < 0) goto err_dpcd_read; + dev_err(dev, "channel(s) EQ not ok\n"); dev_info(dev, "0x0200 SINK_COUNT: 0x%02x\n", tmp[0]); dev_info(dev, "0x0201 DEVICE_SERVICE_IRQ_VECTOR: 0x%02x\n", tmp[1]); @@ -985,10 +982,6 @@ static int tc_main_link_setup(struct tc_data *tc) dev_info(dev, "0x0206 ADJUST_REQUEST_LANE0_1: 0x%02x\n", tmp[6]);
- if (!ready) - dev_err(dev, "Lane0/1 not ready\n"); - if (!aligned) - dev_err(dev, "Lane0/1 not aligned\n"); return -EAGAIN; }
From: Christophe JAILLET christophe.jaillet@wanadoo.fr
[ Upstream commit 8677b1ac2db021ab30bb1fa34f1e56ebe0051ec3 ]
If we don't find a matching device node, we must free the memory allocated in 'omap_dmm' a few lines above.
Fixes: 7cb0d6c17b96 ("drm/omap: fix TILER on OMAP5") Signed-off-by: Christophe JAILLET christophe.jaillet@wanadoo.fr Signed-off-by: Tomi Valkeinen tomi.valkeinen@ti.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/omapdrm/omap_dmm_tiler.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c index 1dd3dafc59af..c60a85e82c6d 100644 --- a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c +++ b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c @@ -638,7 +638,8 @@ static int omap_dmm_probe(struct platform_device *dev) match = of_match_node(dmm_of_match, dev->dev.of_node); if (!match) { dev_err(&dev->dev, "failed to find matching device node\n"); - return -ENODEV; + ret = -ENODEV; + goto fail; }
omap_dmm->plat_data = match->data;
From: Arnd Bergmann arnd@arndb.de
[ Upstream commit 499ec0ed5eb2f6a7fcaab2dd66ffc5993484bda9 ]
The new backlight code causes a link failure when backlight support itself is disabled:
drivers/gpu/drm/omapdrm/displays/panel-dpi.o: In function `panel_dpi_probe_of': panel-dpi.c:(.text+0x35c): undefined reference to `of_find_backlight_by_node'
This adds a Kconfig dependency like we have for the other OMAP display targets.
Fixes: 39135a305a0f ("drm/omap: displays: panel-dpi: Support for handling backlight devices") Signed-off-by: Arnd Bergmann arnd@arndb.de Signed-off-by: Tomi Valkeinen tomi.valkeinen@ti.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/gpu/drm/omapdrm/displays/Kconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/omapdrm/displays/Kconfig b/drivers/gpu/drm/omapdrm/displays/Kconfig index c226da145fb3..a349cb61961e 100644 --- a/drivers/gpu/drm/omapdrm/displays/Kconfig +++ b/drivers/gpu/drm/omapdrm/displays/Kconfig @@ -35,6 +35,7 @@ config DRM_OMAP_CONNECTOR_ANALOG_TV
config DRM_OMAP_PANEL_DPI tristate "Generic DPI panel" + depends on BACKLIGHT_CLASS_DEVICE help Driver for generic DPI panels.
From: "Darrick J. Wong" darrick.wong@oracle.com
[ Upstream commit 22a6c83777ac7c17d6c63891beeeac24cf5da450 ]
Fix some complaints from the UBSAN about signed integer addition overflows.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/xfs/xfs_aops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a3eeaba156c5..b0cccf8a81a8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -399,7 +399,7 @@ xfs_map_blocks( (ip->i_df.if_flags & XFS_IFEXTENTS)); ASSERT(offset <= mp->m_super->s_maxbytes);
- if (offset + count > mp->m_super->s_maxbytes) + if ((xfs_ufsize_t)offset + count > mp->m_super->s_maxbytes) count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -1265,7 +1265,7 @@ xfs_map_trim_size( if (mapping_size > size) mapping_size = size; if (offset < i_size_read(inode) && - offset + mapping_size >= i_size_read(inode)) { + (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) { /* limit mapping to block that spans EOF */ mapping_size = roundup_64(i_size_read(inode) - offset, i_blocksize(inode)); @@ -1312,7 +1312,7 @@ xfs_get_blocks( lockmode = xfs_ilock_data_map_shared(ip);
ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + size > mp->m_super->s_maxbytes) + if ((xfs_ufsize_t)offset + size > mp->m_super->s_maxbytes) size = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset);
From: Carlos Maiolino cmaiolino@redhat.com
[ Upstream commit 373b0589dc8d58bc09c9a28d03611ae4fb216057 ]
Once the inode item writeback errors is already fixed, it's time to fix the same problem in dquot code.
Although there were no reports of users hitting this bug in dquot code (at least none I've seen), the bug is there and I was already planning to fix it when the correct approach to fix the inodes part was decided.
This patch aims to fix the same problem in dquot code, regarding failed buffers being unable to be resubmitted once they are flush locked.
Tested with the recently test-case sent to fstests list by Hou Tao.
Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Carlos Maiolino cmaiolino@redhat.com Reviewed-by: Darrick J. Wong darrick.wong@oracle.com Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- fs/xfs/xfs_dquot.c | 14 +++++++++++--- fs/xfs/xfs_dquot_item.c | 40 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 5 deletions(-)
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index cd82429d8df7..5a86495127fd 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -987,14 +987,22 @@ xfs_qm_dqflush_done( * holding the lock before removing the dquot from the AIL. */ if ((lip->li_flags & XFS_LI_IN_AIL) && - lip->li_lsn == qip->qli_flush_lsn) { + ((lip->li_lsn == qip->qli_flush_lsn) || + (lip->li_flags & XFS_LI_FAILED))) {
/* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); - if (lip->li_lsn == qip->qli_flush_lsn) + if (lip->li_lsn == qip->qli_flush_lsn) { xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); - else + } else { + /* + * Clear the failed state since we are about to drop the + * flush lock + */ + if (lip->li_flags & XFS_LI_FAILED) + xfs_clear_li_failed(lip); spin_unlock(&ailp->xa_lock); + } }
/* diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 2c7a1629e064..664dea105e76 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -137,6 +137,26 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); }
+/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * this informs the AIL that the dquot is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. + */ +STATIC void +xfs_dquot_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + struct xfs_dquot *dqp; + + dqp = DQUOT_ITEM(lip)->qli_dquot; + ASSERT(!completion_done(&dqp->q_flush)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, @@ -144,13 +164,28 @@ xfs_qm_dquot_logitem_push( __acquires(&lip->li_ailp->xa_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error;
if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED;
+ /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED;
@@ -242,7 +277,8 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_unlock = xfs_qm_dquot_logitem_unlock, .iop_committed = xfs_qm_dquot_logitem_committed, .iop_push = xfs_qm_dquot_logitem_push, - .iop_committing = xfs_qm_dquot_logitem_committing + .iop_committing = xfs_qm_dquot_logitem_committing, + .iop_error = xfs_dquot_item_error };
/*
From: Vasily Averin vvs@virtuozzo.com
[ Upstream commit 0e18dd12064e07519f7cbff4149ca7fff620cbed ]
perf with --namespace key leaks various memory objects including namespaces
4.14.0+ pid_namespace 1 12 2568 12 8 user_namespace 1 39 824 39 8 net_namespace 1 5 6272 5 8
This happen because perf_fill_ns_link_info() struct patch ns_path: during initialization ns_path incremented counters on related mnt and dentry, but without lost path_put nobody decremented them back. Leaked dentry is name of related namespace, and its leak does not allow to free unused namespace.
Signed-off-by: Vasily Averin vvs@virtuozzo.com Acked-by: Peter Zijlstra peterz@infradead.org Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Arnaldo Carvalho de Melo acme@kernel.org Cc: Hari Bathini hbathini@linux.vnet.ibm.com Cc: Jiri Olsa jolsa@redhat.com Cc: Linus Torvalds torvalds@linux-foundation.org Cc: Namhyung Kim namhyung@kernel.org Cc: Thomas Gleixner tglx@linutronix.de Fixes: commit e422267322cd ("perf: Add PERF_RECORD_NAMESPACES to include namespaces related info") Link: http://lkml.kernel.org/r/c510711b-3904-e5e1-d296-61273d21118d@virtuozzo.com Signed-off-by: Ingo Molnar mingo@kernel.org Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/kernel/events/core.c b/kernel/events/core.c index 24ebad5567b4..8c20af8738ac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6756,6 +6756,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, ns_inode = ns_path.dentry->d_inode; ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); ns_link_info->ino = ns_inode->i_ino; + path_put(&ns_path); } }
From: "Guilherme G. Piccoli" gpiccoli@linux.vnet.ibm.com
[ Upstream commit e4717292ddebcfe231651b5aff9fa19ca158d178 ]
As part of the scsi EH path, aacraid performs a reinitialization of the adapter, which encompass freeing resources and IRQs, NULLifying lots of pointers, and then initialize it all over again. We've identified a problem during the free IRQ portion of this path if CONFIG_DEBUG_SHIRQ is enabled on kernel config file.
Happens that, in case this flag was set, right after free_irq() effectively clears the interrupt, it checks if it was requested as IRQF_SHARED. In positive case, it performs another call to the IRQ handler on driver. Problem is: since aacraid currently free some resources *before* freeing the IRQ, once free_irq() path calls the handler again (due to CONFIG_DEBUG_SHIRQ), aacraid crashes due to NULL pointer dereference with the following trace:
aac_src_intr_message+0xf8/0x740 [aacraid] __free_irq+0x33c/0x4a0 free_irq+0x78/0xb0 aac_free_irq+0x13c/0x150 [aacraid] aac_reset_adapter+0x2e8/0x970 [aacraid] aac_eh_reset+0x3a8/0x5d0 [aacraid] scsi_try_host_reset+0x74/0x180 scsi_eh_ready_devs+0xc70/0x1510 scsi_error_handler+0x624/0xa20
This patch prevents the crash by changing the order of the deinitialization in this path of aacraid: first we clear the IRQ, then we free other resources. No functional change intended.
Signed-off-by: Guilherme G. Piccoli gpiccoli@linux.vnet.ibm.com Reviewed-by: Raghava Aditya Renukunta RaghavaAditya.Renukunta@microsemi.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/scsi/aacraid/commsup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c index 525a652dab48..16a200127687 100644 --- a/drivers/scsi/aacraid/commsup.c +++ b/drivers/scsi/aacraid/commsup.c @@ -1583,6 +1583,7 @@ static int _aac_reset_adapter(struct aac_dev *aac, int forced, u8 reset_type) * will ensure that i/o is queisced and the card is flushed in that * case. */ + aac_free_irq(aac); aac_fib_map_free(aac); dma_free_coherent(&aac->pdev->dev, aac->comm_size, aac->comm_addr, aac->comm_phys); @@ -1590,7 +1591,6 @@ static int _aac_reset_adapter(struct aac_dev *aac, int forced, u8 reset_type) aac->comm_phys = 0; kfree(aac->queues); aac->queues = NULL; - aac_free_irq(aac); kfree(aac->fsa_dev); aac->fsa_dev = NULL;
From: "Gustavo A. R. Silva" garsilva@embeddedor.com
[ Upstream commit 727535903bea924c4f73abb202c4b3e85fff0ca4 ]
_vreg_ is being dereferenced before it is null checked, hence there is a potential null pointer dereference.
Fix this by moving the pointer dereference after _vreg_ has been null checked.
This issue was detected with the help of Coccinelle.
Fixes: aa4976130934 ("ufs: Add regulator enable support") Signed-off-by: Gustavo A. R. Silva garsilva@embeddedor.com Reviewed-by: Subhash Jadavani subhashj@codeaurora.org Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/scsi/ufs/ufshcd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 794a4600e952..d344fef01f1d 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -6555,12 +6555,15 @@ static int ufshcd_config_vreg(struct device *dev, struct ufs_vreg *vreg, bool on) { int ret = 0; - struct regulator *reg = vreg->reg; - const char *name = vreg->name; + struct regulator *reg; + const char *name; int min_uV, uA_load;
BUG_ON(!vreg);
+ reg = vreg->reg; + name = vreg->name; + if (regulator_count_voltages(reg) > 0) { min_uV = on ? vreg->min_uV : 0; ret = regulator_set_voltage(reg, min_uV, vreg->max_uV);
From: Emmanuel Grumbach emmanuel.grumbach@intel.com
[ Upstream commit d1b275ffec459c5ae12b5c7086c84175696e5a9f ]
The MONITOR type is missing in the interface type switch. Add it.
Signed-off-by: Emmanuel Grumbach emmanuel.grumbach@intel.com Signed-off-by: Luca Coelho luciano.coelho@intel.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/net/wireless/intel/iwlwifi/mvm/utils.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c index 53e269d54050..0ae7624eac9d 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c @@ -1181,6 +1181,8 @@ unsigned int iwl_mvm_get_wd_timeout(struct iwl_mvm *mvm, return le32_to_cpu(txq_timer->p2p_go); case NL80211_IFTYPE_P2P_DEVICE: return le32_to_cpu(txq_timer->p2p_device); + case NL80211_IFTYPE_MONITOR: + return default_timeout; default: WARN_ON(1); return mvm->cfg->base_params->wd_timeout;
From: Sara Sharon sara.sharon@intel.com
[ Upstream commit 0232d2cd7aa8e1b810fe84fb4059a0bd1eabe2ba ]
When getting HW rfkill we get stop_device being called from two paths. One path is the IRQ calling stop device, and updating op mode and stack. As a result, cfg80211 is running rfkill sync work that shuts down all devices (second path). In the second path, we eventually get to iwl_mvm_stop_device which calls iwl_fw_dump_conf_clear->iwl_fw_dbg_stop_recording, that access periphery registers. The device may be stopped at this point from the first path, which will result with a failure to access those registers. Simply checking for the trans status is insufficient, since the race will still exist, only minimized. Instead, move the stop from iwl_fw_dump_conf_clear (which is getting called only from stop path) to the transport stop device function, where the access is always safe. This has the added value, of actually stopping dbgc before stopping device even when the stop is initiated from the transport.
Fixes: 1efc3843a4ee ("iwlwifi: stop dbgc recording before stopping DMA") Signed-off-by: Sara Sharon sara.sharon@intel.com Signed-off-by: Luca Coelho luciano.coelho@intel.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/net/wireless/intel/iwlwifi/fw/dbg.h | 2 -- drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c | 6 ++++++ drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 9 +++++++++ 3 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.h b/drivers/net/wireless/intel/iwlwifi/fw/dbg.h index 9c889a32fe24..223fb77a3aa9 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.h @@ -209,8 +209,6 @@ static inline void iwl_fw_dbg_stop_recording(struct iwl_fw_runtime *fwrt)
static inline void iwl_fw_dump_conf_clear(struct iwl_fw_runtime *fwrt) { - iwl_fw_dbg_stop_recording(fwrt); - fwrt->dump.conf = FW_DBG_INVALID; }
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c index c59f4581e972..ac05fd1e74c4 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c @@ -49,6 +49,7 @@ * *****************************************************************************/ #include "iwl-trans.h" +#include "iwl-prph.h" #include "iwl-context-info.h" #include "internal.h"
@@ -156,6 +157,11 @@ void _iwl_trans_pcie_gen2_stop_device(struct iwl_trans *trans, bool low_power)
trans_pcie->is_down = true;
+ /* Stop dbgc before stopping device */ + iwl_write_prph(trans, DBGC_IN_SAMPLE, 0); + udelay(100); + iwl_write_prph(trans, DBGC_OUT_CTRL, 0); + /* tell the device to stop sending interrupts */ iwl_disable_interrupts(trans);
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index 2e3e013ec95a..12a9b86d71ea 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -1138,6 +1138,15 @@ static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans, bool low_power)
trans_pcie->is_down = true;
+ /* Stop dbgc before stopping device */ + if (trans->cfg->device_family == IWL_DEVICE_FAMILY_7000) { + iwl_set_bits_prph(trans, MON_BUFF_SAMPLE_CTL, 0x100); + } else { + iwl_write_prph(trans, DBGC_IN_SAMPLE, 0); + udelay(100); + iwl_write_prph(trans, DBGC_OUT_CTRL, 0); + } + /* tell the device to stop sending interrupts */ iwl_disable_interrupts(trans);
From: Florian Fainelli f.fainelli@gmail.com
[ Upstream commit 77416ab35f5712382e5a792bfa1736ceb70d5bbb ]
The AHCI controller is currently enabled for all of these boards: bcm958623hr and bcm958625hr would result in a hard hang on boot that we cannot get rid of. Since this does not appear to have an easy and simple fix, just disable the AHCI controller for now until this gets resolved.
Fixes: 70725d6e97ac ("ARM: dts: NSP: Enable SATA on bcm958625hr") Fixes: d454c3762437 ("ARM: dts: NSP: Add new DT file for bcm958623hr") Acked-by: Jon Mason jon.mason@broadcom.com Signed-off-by: Florian Fainelli f.fainelli@gmail.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/arm/boot/dts/bcm958623hr.dts | 4 ---- arch/arm/boot/dts/bcm958625hr.dts | 4 ---- 2 files changed, 8 deletions(-)
diff --git a/arch/arm/boot/dts/bcm958623hr.dts b/arch/arm/boot/dts/bcm958623hr.dts index 3bc50849d013..b8bde13de90a 100644 --- a/arch/arm/boot/dts/bcm958623hr.dts +++ b/arch/arm/boot/dts/bcm958623hr.dts @@ -141,10 +141,6 @@ status = "okay"; };
-&sata { - status = "okay"; -}; - &qspi { bspi-sel = <0>; flash: m25p80@0 { diff --git a/arch/arm/boot/dts/bcm958625hr.dts b/arch/arm/boot/dts/bcm958625hr.dts index d94d14b3c745..6a44b8021702 100644 --- a/arch/arm/boot/dts/bcm958625hr.dts +++ b/arch/arm/boot/dts/bcm958625hr.dts @@ -177,10 +177,6 @@ status = "okay"; };
-&sata { - status = "okay"; -}; - &srab { compatible = "brcm,bcm58625-srab", "brcm,nsp-srab"; status = "okay";
From: Florian Fainelli f.fainelli@gmail.com
[ Upstream commit 5f1aa51c7a1eef1c5a60b8334e32c89904964245 ]
Booting a kernel results in the kernel warning us about the following PPI interrupts configuration: [ 0.105127] smp: Bringing up secondary CPUs ... [ 0.110545] GIC: PPI11 is secure or misconfigured [ 0.110551] GIC: PPI13 is secure or misconfigured
Fix this by using the appropriate edge configuration for PPI11 and PPI13, this is similar to what was fixed for Northstar (BCM5301X) in commit 0e34079cd1f6 ("ARM: dts: BCM5301X: Correct GIC_PPI interrupt flags").
Fixes: 7b2e987de207 ("ARM: NSP: add minimal Northstar Plus device tree") Fixes: 1a9d53cabaf4 ("ARM: dts: NSP: Add TWD Support to DT") Acked-by: Jon Mason jon.mason@broadcom.com Signed-off-by: Florian Fainelli f.fainelli@gmail.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/arm/boot/dts/bcm-nsp.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/arm/boot/dts/bcm-nsp.dtsi b/arch/arm/boot/dts/bcm-nsp.dtsi index dff66974feed..d5f5e92e7488 100644 --- a/arch/arm/boot/dts/bcm-nsp.dtsi +++ b/arch/arm/boot/dts/bcm-nsp.dtsi @@ -85,7 +85,7 @@ timer@20200 { compatible = "arm,cortex-a9-global-timer"; reg = <0x20200 0x100>; - interrupts = <GIC_PPI 11 IRQ_TYPE_LEVEL_HIGH>; + interrupts = <GIC_PPI 11 IRQ_TYPE_EDGE_RISING>; clocks = <&periph_clk>; };
@@ -93,7 +93,7 @@ compatible = "arm,cortex-a9-twd-timer"; reg = <0x20600 0x20>; interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(2) | - IRQ_TYPE_LEVEL_HIGH)>; + IRQ_TYPE_EDGE_RISING)>; clocks = <&periph_clk>; };
From: Icenowy Zheng icenowy@aosc.io
[ Upstream commit 04226916d2360f56d57ad00bc48d2d1854d1e0b0 ]
A new usbid of UTV007 is found in a newly bought device.
The usbid is 1f71:3301.
The ID on the chip is: UTV007 A89029.1 1520L18K1
Both video and audio is tested with the modified usbtv driver.
Signed-off-by: Icenowy Zheng icenowy@aosc.io Acked-by: Lubomir Rintel lkundrak@v3.sk Signed-off-by: Mauro Carvalho Chehab mchehab@s-opensource.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/media/usb/usbtv/usbtv-core.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/media/usb/usbtv/usbtv-core.c b/drivers/media/usb/usbtv/usbtv-core.c index f06f09a0876e..71fb5734995b 100644 --- a/drivers/media/usb/usbtv/usbtv-core.c +++ b/drivers/media/usb/usbtv/usbtv-core.c @@ -144,6 +144,7 @@ static void usbtv_disconnect(struct usb_interface *intf)
static const struct usb_device_id usbtv_id_table[] = { { USB_DEVICE(0x1b71, 0x3002) }, + { USB_DEVICE(0x1f71, 0x3301) }, {} }; MODULE_DEVICE_TABLE(usb, usbtv_id_table);
From: Colin Ian King colin.king@canonical.com
[ Upstream commit b2fc059fa549fe6881d4c1f8d698b0f50bcd16ec ]
Avoid dereferencing pointer g until after g has been sanity null checked; move the assignment of cdev much later when it is required into a more local scope.
Detected by CoverityScan, CID#1222135 ("Dereference before null check")
Fixes: b785ea7ce662 ("usb: gadget: composite: fix ep->maxburst initialization") Signed-off-by: Colin Ian King colin.king@canonical.com Signed-off-by: Felipe Balbi felipe.balbi@linux.intel.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/usb/gadget/composite.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 5d061b3d8224..ed9346f0b000 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -150,7 +150,6 @@ int config_ep_by_speed(struct usb_gadget *g, struct usb_function *f, struct usb_ep *_ep) { - struct usb_composite_dev *cdev = get_gadget_data(g); struct usb_endpoint_descriptor *chosen_desc = NULL; struct usb_descriptor_header **speed_desc = NULL;
@@ -229,8 +228,12 @@ ep_found: _ep->maxburst = comp_desc->bMaxBurst + 1; break; default: - if (comp_desc->bMaxBurst != 0) + if (comp_desc->bMaxBurst != 0) { + struct usb_composite_dev *cdev; + + cdev = get_gadget_data(g); ERROR(cdev, "ep0 bMaxBurst must be 0\n"); + } _ep->maxburst = 1; break; }
From: Juergen Gross jgross@suse.com
[ Upstream commit 42b3a4cb5609de757f5445fcad18945ba9239a07 ]
Add early interrupt handlers activated by idt_setup_early_handler() to the handlers supported by Xen pv guests. This will allow for early WARN() calls not crashing the guest.
Suggested-by: Andy Lutomirski luto@kernel.org Signed-off-by: Juergen Gross jgross@suse.com Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: xen-devel@lists.xenproject.org Cc: boris.ostrovsky@oracle.com Link: https://lkml.kernel.org/r/20171124084221.30172-1-jgross@suse.com Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- arch/x86/include/asm/segment.h | 12 ++++++++++++ arch/x86/mm/extable.c | 4 +++- arch/x86/xen/enlighten_pv.c | 37 ++++++++++++++++++++++++------------- arch/x86/xen/xen-asm_64.S | 14 ++++++++++++++ 4 files changed, 53 insertions(+), 14 deletions(-)
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index b20f9d623f9c..8f09012b92e7 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -236,11 +236,23 @@ */ #define EARLY_IDT_HANDLER_SIZE 9
+/* + * xen_early_idt_handler_array is for Xen pv guests: for each entry in + * early_idt_handler_array it contains a prequel in the form of + * pop %rcx; pop %r11; jmp early_idt_handler_array[i]; summing up to + * max 8 bytes. + */ +#define XEN_EARLY_IDT_HANDLER_SIZE 8 + #ifndef __ASSEMBLY__
extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; extern void early_ignore_irq(void);
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +extern const char xen_early_idt_handler_array[NUM_EXCEPTION_VECTORS][XEN_EARLY_IDT_HANDLER_SIZE]; +#endif + /* * Load a segment. Fall back on loading the zero segment if something goes * wrong. This variant assumes that loading zero fully clears the segment. diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 30bc4812ceb8..9fe656c42aa5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,6 +1,7 @@ #include <linux/extable.h> #include <linux/uaccess.h> #include <linux/sched/debug.h> +#include <xen/xen.h>
#include <asm/fpu/internal.h> #include <asm/traps.h> @@ -212,8 +213,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * Old CPUs leave the high bits of CS on the stack * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. + * Xen pv domains are not using the default __KERNEL_CS. */ - if (regs->cs != __KERNEL_CS) + if (!xen_pv_domain() && regs->cs != __KERNEL_CS) goto fail;
/* diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index ae3a071e1d0f..899a22a02e95 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -622,7 +622,7 @@ static struct trap_array_entry trap_array[] = { { simd_coprocessor_error, xen_simd_coprocessor_error, false }, };
-static bool get_trap_addr(void **addr, unsigned int ist) +static bool __ref get_trap_addr(void **addr, unsigned int ist) { unsigned int nr; bool ist_okay = false; @@ -644,6 +644,14 @@ static bool get_trap_addr(void **addr, unsigned int ist) } }
+ if (nr == ARRAY_SIZE(trap_array) && + *addr >= (void *)early_idt_handler_array[0] && + *addr < (void *)early_idt_handler_array[NUM_EXCEPTION_VECTORS]) { + nr = (*addr - (void *)early_idt_handler_array[0]) / + EARLY_IDT_HANDLER_SIZE; + *addr = (void *)xen_early_idt_handler_array[nr]; + } + if (WARN_ON(ist != 0 && !ist_okay)) return false;
@@ -1261,6 +1269,21 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_setup_gdt(0);
xen_init_irq_ops(); + + /* Let's presume PV guests always boot on vCPU with id 0. */ + per_cpu(xen_vcpu_id, 0) = 0; + + /* + * Setup xen_vcpu early because idt_setup_early_handler needs it for + * local_irq_disable(), irqs_disabled(). + * + * Don't do the full vcpu_info placement stuff until we have + * the cpu_possible_mask and a non-dummy shared_info. + */ + xen_vcpu_info_reset(0); + + idt_setup_early_handler(); + xen_init_capabilities();
#ifdef CONFIG_X86_LOCAL_APIC @@ -1294,18 +1317,6 @@ asmlinkage __visible void __init xen_start_kernel(void) */ acpi_numa = -1; #endif - /* Let's presume PV guests always boot on vCPU with id 0. */ - per_cpu(xen_vcpu_id, 0) = 0; - - /* - * Setup xen_vcpu early because start_kernel needs it for - * local_irq_disable(), irqs_disabled(). - * - * Don't do the full vcpu_info placement stuff until we have - * the cpu_possible_mask and a non-dummy shared_info. - */ - xen_vcpu_info_reset(0); - WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
local_irq_disable(); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 8a10c9a9e2b5..417b339e5c8e 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -15,6 +15,7 @@
#include <xen/interface/xen.h>
+#include <linux/init.h> #include <linux/linkage.h>
.macro xen_pv_trap name @@ -54,6 +55,19 @@ xen_pv_trap entry_INT80_compat #endif xen_pv_trap hypervisor_callback
+ __INIT +ENTRY(xen_early_idt_handler_array) + i = 0 + .rept NUM_EXCEPTION_VECTORS + pop %rcx + pop %r11 + jmp early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE + i = i + 1 + .fill xen_early_idt_handler_array + i*XEN_EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc + .endr +END(xen_early_idt_handler_array) + __FINIT + hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* * Xen64 iret frame:
From: Larry Finger Larry.Finger@lwfinger.net
[ Upstream commit b77992d2df9e47144354d1b25328b180afa33442 ]
When not associated with an AP, wifi device drivers should respond to the SIOCGIWESSID ioctl with a zero-length string for the SSID, which is the behavior expected by dhcpcd.
Currently, this driver returns an error code (-1) from the ioctl call, which causes dhcpcd to assume that the device is not a wireless interface and therefore it fails to work correctly with it thereafter.
This problem was reported and tested at https://github.com/lwfinger/rtl8188eu/issues/234.
Signed-off-by: Larry Finger Larry.Finger@lwfinger.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin alexander.levin@microsoft.com --- drivers/staging/rtl8188eu/os_dep/ioctl_linux.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c index c0664dc80bf2..446310775e90 100644 --- a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c +++ b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c @@ -1395,19 +1395,13 @@ static int rtw_wx_get_essid(struct net_device *dev, if ((check_fwstate(pmlmepriv, _FW_LINKED)) || (check_fwstate(pmlmepriv, WIFI_ADHOC_MASTER_STATE))) { len = pcur_bss->Ssid.SsidLength; - - wrqu->essid.length = len; - memcpy(extra, pcur_bss->Ssid.Ssid, len); - - wrqu->essid.flags = 1; } else { - ret = -1; - goto exit; + len = 0; + *extra = 0; } - -exit: - + wrqu->essid.length = len; + wrqu->essid.flags = 1;
return ret; }
linux-stable-mirror@lists.linaro.org