Limit the number of files in io_uring fixed tables by RLIMIT_NOFILE,
that's the first and the simpliest restriction that we should impose.
Cc: stable(a)vger.kernel.org
Suggested-by: Jens Axboe <axboe(a)kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
---
fs/io_uring.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 30edc329d803..e6301d5d03a8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7730,6 +7730,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
+ if (nr_args > rlimit(RLIMIT_NOFILE))
+ return -EMFILE;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
--
2.32.0
When a mapped level interrupt (a timer, for example) is deactivated
by the guest, the corresponding host interrupt is equally deactivated.
However, the fate of the pending state still needs to be dealt
with in SW.
This is specially true when the interrupt was in the active+pending
state in the virtual distributor at the point where the guest
was entered. On exit, the pending state is potentially stale
(the guest may have put the interrupt in a non-pending state).
If we don't do anything, the interrupt will be spuriously injected
in the guest. Although this shouldn't have any ill effect (spurious
interrupts are always possible), we can improve the emulation by
detecting the deactivation-while-pending case and resample the
interrupt.
While we're at it, move the logic into a common helper that can
be shared between the two GIC implementations.
Fixes: e40cc57bac79 ("KVM: arm/arm64: vgic: Support level-triggered mapped interrupts")
Reported-by: Raghavendra Rao Ananta <rananta(a)google.com>
Tested-by: Raghavendra Rao Ananta <rananta(a)google.com>
Reviewed-by: Oliver Upton <oupton(a)google.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
Notes:
* From v1:
- Moved the resampling into a separate helper, and pointed both
GIC implementations to it.
- Collected TB, RB, with thanks.
arch/arm64/kvm/vgic/vgic-v2.c | 36 +++++----------------------------
arch/arm64/kvm/vgic/vgic-v3.c | 36 +++++----------------------------
arch/arm64/kvm/vgic/vgic.c | 38 +++++++++++++++++++++++++++++++++++
arch/arm64/kvm/vgic/vgic.h | 2 ++
4 files changed, 50 insertions(+), 62 deletions(-)
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 2c580204f1dc..95a18cec14a3 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
u32 val = cpuif->vgic_lr[lr];
u32 cpuid, intid = val & GICH_LR_VIRTUALID;
struct vgic_irq *irq;
+ bool deactivated;
/* Extract the source vCPU id from the LR */
cpuid = val & GICH_LR_PHYSID_CPUID;
@@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
if (irq->active && vgic_irq_is_sgi(intid))
@@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & GICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 66004f61cd83..21a6207fb2ee 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
u32 intid, cpuid;
struct vgic_irq *irq;
bool is_v2_sgi = false;
+ bool deactivated;
cpuid = val & GICH_LR_PHYSID_CPUID;
cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
@@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
irq->active = !!(val & ICH_LR_ACTIVE_BIT);
if (irq->active && is_v2_sgi)
@@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & ICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 111bff47e471..42a6ac78fe95 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -1022,3 +1022,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
return map_is_active;
}
+
+/*
+ * Level-triggered mapped IRQs are special because we only observe rising
+ * edges as input to the VGIC.
+ *
+ * If the guest never acked the interrupt we have to sample the physical
+ * line and set the line level, because the device state could have changed
+ * or we simply need to process the still pending interrupt later.
+ *
+ * We could also have entered the guest with the interrupt active+pending.
+ * On the next exit, we need to re-evaluate the pending state, as it could
+ * otherwise result in a spurious interrupt by injecting a now potentially
+ * stale pending state.
+ *
+ * If this causes us to lower the level, we have to also clear the physical
+ * active state, since we will otherwise never be told when the interrupt
+ * becomes asserted again.
+ *
+ * Another case is when the interrupt requires a helping hand on
+ * deactivation (no HW deactivation, for example).
+ */
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending)
+{
+ if (vgic_irq_is_mapped_level(irq)) {
+ bool resample = false;
+
+ if (unlikely(vgic_irq_needs_resampling(irq))) {
+ resample = !(irq->active || irq->pending_latch);
+ } else if (lr_pending || (lr_deactivated && irq->line_level)) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+ resample = !irq->line_level;
+ }
+
+ if (resample)
+ vgic_irq_set_phys_active(irq, false);
+ }
+}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index dc1f3d1657ee..14a9218641f5 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
unsigned long flags);
void vgic_kick_vcpus(struct kvm *kvm);
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending);
int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
phys_addr_t addr, phys_addr_t alignment);
--
2.30.2
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Subject: hugetlb: don't pass page cache pages to restore_reserve_on_error
syzbot hit kernel BUG at fs/hugetlbfs/inode.c:532 as described in [1].
This BUG triggers if the HPageRestoreReserve flag is set on a page in
the page cache. It should never be set, as the routine
huge_add_to_page_cache explicitly clears the flag after adding a page
to the cache.
The only code other than huge page allocation which sets the flag is
restore_reserve_on_error. It will potentially set the flag in rare out
of memory conditions. syzbot was injecting errors to cause memory
allocation errors which exercised this specific path.
The code in restore_reserve_on_error is doing the right thing. However,
there are instances where pages in the page cache were being passed to
restore_reserve_on_error. This is incorrect, as once a page goes into
the cache reservation information will not be modified for the page until
it is removed from the cache. Error paths do not remove pages from the
cache, so even in the case of error, the page will remain in the cache
and no reservation adjustment is needed.
Modify routines that potentially call restore_reserve_on_error with a
page cache page to no longer do so.
Note on fixes tag:
Prior to commit 846be08578ed ("mm/hugetlb: expand restore_reserve_on_error
functionality") the routine would not process page cache pages because
the HPageRestoreReserve flag is not set on such pages. Therefore, this
issue could not be trigggered. The code added by commit 846be08578ed
("mm/hugetlb: expand restore_reserve_on_error functionality") is needed
and correct. It exposed incorrect calls to restore_reserve_on_error which
is the root cause addressed by this commit.
[1] https://lore.kernel.org/linux-mm/00000000000050776d05c9b7c7f0@google.com/
Link: https://lkml.kernel.org/r/20210818213304.37038-1-mike.kravetz@oracle.com
Fixes: 846be08578ed ("mm/hugetlb: expand restore_reserve_on_error functionality")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reported-by: <syzbot+67654e51e54455f1c585(a)syzkaller.appspotmail.com>
Cc: Mina Almasry <almasrymina(a)google.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Naoya Horiguchi <naoya.horiguchi(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
--- a/mm/hugetlb.c~hugetlb-dont-pass-page-cache-pages-to-restore_reserve_on_error
+++ a/mm/hugetlb.c
@@ -2476,7 +2476,7 @@ void restore_reserve_on_error(struct hst
if (!rc) {
/*
* This indicates there is an entry in the reserve map
- * added by alloc_huge_page. We know it was added
+ * not added by alloc_huge_page. We know it was added
* before the alloc_huge_page call, otherwise
* HPageRestoreReserve would be set on the page.
* Remove the entry so that a subsequent allocation
@@ -4660,7 +4660,9 @@ retry_avoidcopy:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
- restore_reserve_on_error(h, vma, haddr, new_page);
+ /* No restore in case of successful pagetable update (Break COW) */
+ if (new_page != old_page)
+ restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
@@ -4776,7 +4778,7 @@ static vm_fault_t hugetlb_no_page(struct
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
- bool new_page = false;
+ bool new_page, new_pagecache_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -4799,6 +4801,7 @@ static vm_fault_t hugetlb_no_page(struct
goto out;
retry:
+ new_page = false;
page = find_lock_page(mapping, idx);
if (!page) {
/* Check for page in userfault range */
@@ -4842,6 +4845,7 @@ retry:
goto retry;
goto out;
}
+ new_pagecache_page = true;
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
@@ -4926,7 +4930,9 @@ backout:
spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
- restore_reserve_on_error(h, vma, haddr, page);
+ /* restore reserve for newly allocated pages not in page cache */
+ if (new_page && !new_pagecache_page)
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
goto out;
}
@@ -5135,6 +5141,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_s
int ret = -ENOMEM;
struct page *page;
int writable;
+ bool new_pagecache_page = false;
if (is_continue) {
ret = -EFAULT;
@@ -5228,6 +5235,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_s
ret = huge_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
+ new_pagecache_page = true;
}
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
@@ -5291,7 +5299,8 @@ out_release_unlock:
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
- restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ if (!new_pagecache_page)
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
_
From: Marco Elver <elver(a)google.com>
Subject: kfence: fix is_kfence_address() for addresses below KFENCE_POOL_SIZE
Originally the addr != NULL check was meant to take care of the case where
__kfence_pool == NULL (KFENCE is disabled). However, this does not work
for addresses where addr > 0 && addr < KFENCE_POOL_SIZE.
This can be the case on NULL-deref where addr > 0 && addr < PAGE_SIZE or
any other faulting access with addr < KFENCE_POOL_SIZE. While the kernel
would likely crash, the stack traces and report might be confusing due to
double faults upon KFENCE's attempt to unprotect such an address.
Fix it by just checking that __kfence_pool != NULL instead.
Link: https://lkml.kernel.org/r/20210818130300.2482437-1-elver@google.com
Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure")
Signed-off-by: Marco Elver <elver(a)google.com>
Reported-by: Kuan-Ying Lee <Kuan-Ying.Lee(a)mediatek.com>
Acked-by: Alexander Potapenko <glider(a)google.com>
Cc: Dmitry Vyukov <dvyukov(a)google.com>
Cc: <stable(a)vger.kernel.org> [5.12+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/kfence.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/include/linux/kfence.h~kfence-fix-is_kfence_address-for-addresses-below-kfence_pool_size
+++ a/include/linux/kfence.h
@@ -51,10 +51,11 @@ extern atomic_t kfence_allocation_gate;
static __always_inline bool is_kfence_address(const void *addr)
{
/*
- * The non-NULL check is required in case the __kfence_pool pointer was
- * never initialized; keep it in the slow-path after the range-check.
+ * The __kfence_pool != NULL check is required to deal with the case
+ * where __kfence_pool == NULL && addr < KFENCE_POOL_SIZE. Keep it in
+ * the slow-path after the range-check!
*/
- return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && addr);
+ return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && __kfence_pool);
}
/**
_
From: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Subject: mm/hwpoison: retry with shake_page() for unhandlable pages
HWPoisonHandlable() sometimes returns false for typical user pages due to
races with average memory events like transfers over LRU lists. This
causes failures in hwpoison handling.
There's retry code for such a case but does not work because the retry
loop reaches the retry limit too quickly before the page settles down to
handlable state. Let get_any_page() call shake_page() to fix it.
[naoya.horiguchi(a)nec.com: get_any_page(): return -EIO when retry limit reached]
Link: https://lkml.kernel.org/r/20210819001958.2365157-1-naoya.horiguchi@linux.dev
Link: https://lkml.kernel.org/r/20210817053703.2267588-1-naoya.horiguchi@linux.dev
Fixes: 25182f05ffed ("mm,hwpoison: fix race with hugetlb page allocation")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reported-by: Tony Luck <tony.luck(a)intel.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: <stable(a)vger.kernel.org> [5.13+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory-failure.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
--- a/mm/memory-failure.c~mm-hwpoison-retry-with-shake_page-for-unhandlable-pages
+++ a/mm/memory-failure.c
@@ -1146,7 +1146,7 @@ static int __get_hwpoison_page(struct pa
* unexpected races caused by taking a page refcount.
*/
if (!HWPoisonHandlable(head))
- return 0;
+ return -EBUSY;
if (PageTransHuge(head)) {
/*
@@ -1199,9 +1199,15 @@ try_again:
}
goto out;
} else if (ret == -EBUSY) {
- /* We raced with freeing huge page to buddy, retry. */
- if (pass++ < 3)
+ /*
+ * We raced with (possibly temporary) unhandlable
+ * page, retry.
+ */
+ if (pass++ < 3) {
+ shake_page(p, 1);
goto try_again;
+ }
+ ret = -EIO;
goto out;
}
}
_
From: Johannes Weiner <hannes(a)cmpxchg.org>
Subject: mm: memcontrol: fix occasional OOMs due to proportional memory.low reclaim
We've noticed occasional OOM killing when memory.low settings are in
effect for cgroups. This is unexpected and undesirable as memory.low
is supposed to express non-OOMing memory priorities between cgroups.
The reason for this is proportional memory.low reclaim. When cgroups
are below their memory.low threshold, reclaim passes them over in the
first round, and then retries if it couldn't find pages anywhere else.
But when cgroups are slightly above their memory.low setting, page scan
force is scaled down and diminished in proportion to the overage, to
the point where it can cause reclaim to fail as well - only in that
case we currently don't retry, and instead trigger OOM.
To fix this, hook proportional reclaim into the same retry logic we
have in place for when cgroups are skipped entirely. This way if
reclaim fails and some cgroups were scanned with diminished pressure,
we'll try another full-force cycle before giving up and OOMing.
[akpm(a)linux-foundation.org: coding-style fixes]
Link: https://lkml.kernel.org/r/20210817180506.220056-1-hannes@cmpxchg.org
Fixes: 9783aa9917f8 ("mm, memcg: proportional memory.{low,min} reclaim")
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reported-by: Leon Yang <lnyng(a)fb.com>
Reviewed-by: Rik van Riel <riel(a)surriel.com>
Reviewed-by: Shakeel Butt <shakeelb(a)google.com>
Acked-by: Roman Gushchin <guro(a)fb.com>
Acked-by: Chris Down <chris(a)chrisdown.name>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: <stable(a)vger.kernel.org> [5.4+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/memcontrol.h | 29 +++++++++++++++--------------
mm/vmscan.c | 27 +++++++++++++++++++--------
2 files changed, 34 insertions(+), 22 deletions(-)
--- a/include/linux/memcontrol.h~mm-memcontrol-fix-occasional-ooms-due-to-proportional-memorylow-reclaim
+++ a/include/linux/memcontrol.h
@@ -612,12 +612,15 @@ static inline bool mem_cgroup_disabled(v
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
- struct mem_cgroup *memcg,
- bool in_low_reclaim)
+static inline void mem_cgroup_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg,
+ unsigned long *min,
+ unsigned long *low)
{
+ *min = *low = 0;
+
if (mem_cgroup_disabled())
- return 0;
+ return;
/*
* There is no reclaim protection applied to a targeted reclaim.
@@ -653,13 +656,10 @@ static inline unsigned long mem_cgroup_p
*
*/
if (root == memcg)
- return 0;
-
- if (in_low_reclaim)
- return READ_ONCE(memcg->memory.emin);
+ return;
- return max(READ_ONCE(memcg->memory.emin),
- READ_ONCE(memcg->memory.elow));
+ *min = READ_ONCE(memcg->memory.emin);
+ *low = READ_ONCE(memcg->memory.elow);
}
void mem_cgroup_calculate_protection(struct mem_cgroup *root,
@@ -1147,11 +1147,12 @@ static inline void memcg_memory_event_mm
{
}
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
- struct mem_cgroup *memcg,
- bool in_low_reclaim)
+static inline void mem_cgroup_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg,
+ unsigned long *min,
+ unsigned long *low)
{
- return 0;
+ *min = *low = 0;
}
static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
--- a/mm/vmscan.c~mm-memcontrol-fix-occasional-ooms-due-to-proportional-memorylow-reclaim
+++ a/mm/vmscan.c
@@ -100,9 +100,12 @@ struct scan_control {
unsigned int may_swap:1;
/*
- * Cgroups are not reclaimed below their configured memory.low,
- * unless we threaten to OOM. If any cgroups are skipped due to
- * memory.low and nothing was reclaimed, go back for memory.low.
+ * Cgroup memory below memory.low is protected as long as we
+ * don't threaten to OOM. If any cgroup is reclaimed at
+ * reduced force or passed over entirely due to its memory.low
+ * setting (memcg_low_skipped), and nothing is reclaimed as a
+ * result, then go back for one more cycle that reclaims the protected
+ * memory (memcg_low_reclaim) to avert OOM.
*/
unsigned int memcg_low_reclaim:1;
unsigned int memcg_low_skipped:1;
@@ -2537,15 +2540,14 @@ out:
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long lruvec_size;
+ unsigned long low, min;
unsigned long scan;
- unsigned long protection;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- protection = mem_cgroup_protection(sc->target_mem_cgroup,
- memcg,
- sc->memcg_low_reclaim);
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg,
+ &min, &low);
- if (protection) {
+ if (min || low) {
/*
* Scale a cgroup's reclaim pressure by proportioning
* its current usage to its memory.low or memory.min
@@ -2576,6 +2578,15 @@ out:
* hard protection.
*/
unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long protection;
+
+ /* memory.low scaling, make sure we retry before OOM */
+ if (!sc->memcg_low_reclaim && low > min) {
+ protection = low;
+ sc->memcg_low_skipped = 1;
+ } else {
+ protection = min;
+ }
/* Avoid TOCTOU with earlier protection check */
cgroup_size = max(cgroup_size, protection);
_
Two legacy PCI sysfs objects "legacy_io" and "legacy_mem" were updated
to use an unified address space in the commit 636b21b50152 ("PCI: Revoke
mappings like devmem"). This allows for revocations to be managed from
a single place when drivers want to take over and mmap() a /dev/mem
range.
Following the update, both of the sysfs objects should leverage the
iomem_get_mapping() function to get an appropriate address range, but
only the "legacy_io" has been correctly updated - the second attribute
seems to be using a wrong variable to pass the iomem_get_mapping()
function to.
Thus, correct the variable name used so that the "legacy_mem" sysfs
object would also correctly call the iomem_get_mapping() function.
Fixes: 636b21b50152 ("PCI: Revoke mappings like devmem")
Signed-off-by: Krzysztof Wilczyński <kw(a)linux.com>
---
drivers/pci/pci-sysfs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 5d63df7c1820..7bbf2673c7f2 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -978,7 +978,7 @@ void pci_create_legacy_files(struct pci_bus *b)
b->legacy_mem->size = 1024*1024;
b->legacy_mem->attr.mode = 0600;
b->legacy_mem->mmap = pci_mmap_legacy_mem;
- b->legacy_io->mapping = iomem_get_mapping();
+ b->legacy_mem->mapping = iomem_get_mapping();
pci_adjust_legacy_attr(b, pci_mmap_mem);
error = device_create_bin_file(&b->dev, b->legacy_mem);
if (error)
--
2.32.0
From: Pauli Virtanen <pav(a)iki.fi>
Kernel Version 5.13
commit 55981d3541812234e687062926ff199c83f79a39 upstream.
Some USB BT adapters don't satisfy the MTU requirement mentioned in
commit e848dbd364ac ("Bluetooth: btusb: Add support USB ALT 3 for WBS")
and have ALT 3 setting that produces no/garbled audio. Some adapters
with larger MTU were also reported to have problems with ALT 3.
Add a flag and check it and MTU before selecting ALT 3, falling back to
ALT 1. Enable the flag for Realtek, restoring the previous behavior for
non-Realtek devices.
Tested with USB adapters (mtu<72, no/garbled sound with ALT3, ALT1
works) BCM20702A1 0b05:17cb, CSR8510A10 0a12:0001, and (mtu>=72, ALT3
works) RTL8761BU 0bda:8771, Intel AX200 8087:0029 (after disabling
ALT6). Also got reports for (mtu>=72, ALT 3 reported to produce bad
audio) Intel 8087:0a2b.
Signed-off-by: Pauli Virtanen <pav(a)iki.fi>
Fixes: e848dbd364ac ("Bluetooth: btusb: Add support USB ALT 3 for WBS")
Tested-by: Michał Kępień <kernel(a)kempniu.pl>
Tested-by: Jonathan Lampérth <jon(a)h4n.dev>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz(a)intel.com>
---
drivers/bluetooth/btusb.c | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 488f110e17e27..2336f731dbc7e 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -528,6 +528,7 @@ static const struct dmi_system_id btusb_needs_reset_resume_table[] = {
#define BTUSB_HW_RESET_ACTIVE 12
#define BTUSB_TX_WAIT_VND_EVT 13
#define BTUSB_WAKEUP_DISABLE 14
+#define BTUSB_USE_ALT3_FOR_WBS 15
struct btusb_data {
struct hci_dev *hdev;
@@ -1761,16 +1762,20 @@ static void btusb_work(struct work_struct *work)
/* Bluetooth USB spec recommends alt 6 (63 bytes), but
* many adapters do not support it. Alt 1 appears to
* work for all adapters that do not have alt 6, and
- * which work with WBS at all.
+ * which work with WBS at all. Some devices prefer
+ * alt 3 (HCI payload >= 60 Bytes let air packet
+ * data satisfy 60 bytes), requiring
+ * MTU >= 3 (packets) * 25 (size) - 3 (headers) = 72
+ * see also Core spec 5, vol 4, B 2.1.1 & Table 2.1.
*/
- new_alts = btusb_find_altsetting(data, 6) ? 6 : 1;
- /* Because mSBC frames do not need to be aligned to the
- * SCO packet boundary. If support the Alt 3, use the
- * Alt 3 for HCI payload >= 60 Bytes let air packet
- * data satisfy 60 bytes.
- */
- if (new_alts == 1 && btusb_find_altsetting(data, 3))
+ if (btusb_find_altsetting(data, 6))
+ new_alts = 6;
+ else if (btusb_find_altsetting(data, 3) &&
+ hdev->sco_mtu >= 72 &&
+ test_bit(BTUSB_USE_ALT3_FOR_WBS, &data->flags))
new_alts = 3;
+ else
+ new_alts = 1;
}
if (btusb_switch_alt_setting(hdev, new_alts) < 0)
@@ -3882,6 +3887,7 @@ static int btusb_probe(struct usb_interface *intf,
* (DEVICE_REMOTE_WAKEUP)
*/
set_bit(BTUSB_WAKEUP_DISABLE, &data->flags);
+ set_bit(BTUSB_USE_ALT3_FOR_WBS, &data->flags);
}
if (!reset)
--
cgit 1.2.3-1.el7
Currently there are (at least) two problems in the way pwm_bl starts
managing the enable_gpio pin. Both occur when the backlight is initially
off and the driver finds the pin not already in output mode and, as a
result, unconditionally switches it to output-mode and asserts the signal.
Problem 1: This could cause the backlight to flicker since, at this stage
in driver initialisation, we have no idea what the PWM and regulator are
doing (an unconfigured PWM could easily "rest" at 100% duty cycle).
Problem 2: This will cause us not to correctly honour the
post_pwm_on_delay (which also risks flickers).
Fix this by moving the code to configure the GPIO output mode until after
we have examines the handover state. That allows us to initialize
enable_gpio to off if the backlight is currently off and on if the
backlight is on.
Reported-by: Marek Vasut <marex(a)denx.de>
Signed-off-by: Daniel Thompson <daniel.thompson(a)linaro.org>
Cc: stable(a)vger.kernel.org
Acked-by: Marek Vasut <marex(a)denx.de>
Tested-by: Marek Vasut <marex(a)denx.de>
---
drivers/video/backlight/pwm_bl.c | 54 +++++++++++++++++---------------
1 file changed, 28 insertions(+), 26 deletions(-)
diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c
index e48fded3e414..8d8959a70e44 100644
--- a/drivers/video/backlight/pwm_bl.c
+++ b/drivers/video/backlight/pwm_bl.c
@@ -409,6 +409,33 @@ static bool pwm_backlight_is_linear(struct platform_pwm_backlight_data *data)
static int pwm_backlight_initial_power_state(const struct pwm_bl_data *pb)
{
struct device_node *node = pb->dev->of_node;
+ bool active = true;
+
+ /*
+ * If the enable GPIO is present, observable (either as input
+ * or output) and off then the backlight is not currently active.
+ * */
+ if (pb->enable_gpio && gpiod_get_value_cansleep(pb->enable_gpio) == 0)
+ active = false;
+
+ if (!regulator_is_enabled(pb->power_supply))
+ active = false;
+
+ if (!pwm_is_enabled(pb->pwm))
+ active = false;
+
+ /*
+ * Synchronize the enable_gpio with the observed state of the
+ * hardware.
+ */
+ if (pb->enable_gpio)
+ gpiod_direction_output(pb->enable_gpio, active);
+
+ /*
+ * Do not change pb->enabled here! pb->enabled essentially
+ * tells us if we own one of the regulator's use counts and
+ * right now we do not.
+ */
/* Not booted with device tree or no phandle link to the node */
if (!node || !node->phandle)
@@ -420,20 +447,7 @@ static int pwm_backlight_initial_power_state(const struct pwm_bl_data *pb)
* assume that another driver will enable the backlight at the
* appropriate time. Therefore, if it is disabled, keep it so.
*/
-
- /* if the enable GPIO is disabled, do not enable the backlight */
- if (pb->enable_gpio && gpiod_get_value_cansleep(pb->enable_gpio) == 0)
- return FB_BLANK_POWERDOWN;
-
- /* The regulator is disabled, do not enable the backlight */
- if (!regulator_is_enabled(pb->power_supply))
- return FB_BLANK_POWERDOWN;
-
- /* The PWM is disabled, keep it like this */
- if (!pwm_is_enabled(pb->pwm))
- return FB_BLANK_POWERDOWN;
-
- return FB_BLANK_UNBLANK;
+ return active ? FB_BLANK_UNBLANK: FB_BLANK_POWERDOWN;
}
static int pwm_backlight_probe(struct platform_device *pdev)
@@ -486,18 +500,6 @@ static int pwm_backlight_probe(struct platform_device *pdev)
goto err_alloc;
}
- /*
- * If the GPIO is not known to be already configured as output, that
- * is, if gpiod_get_direction returns either 1 or -EINVAL, change the
- * direction to output and set the GPIO as active.
- * Do not force the GPIO to active when it was already output as it
- * could cause backlight flickering or we would enable the backlight too
- * early. Leave the decision of the initial backlight state for later.
- */
- if (pb->enable_gpio &&
- gpiod_get_direction(pb->enable_gpio) != 0)
- gpiod_direction_output(pb->enable_gpio, 1);
-
pb->power_supply = devm_regulator_get(&pdev->dev, "power");
if (IS_ERR(pb->power_supply)) {
ret = PTR_ERR(pb->power_supply);
base-commit: 2734d6c1b1a089fb593ef6a23d4b70903526fe0c
--
2.30.2
Dear powerpc maintainers,
The script ./scripts/checkkconfigsymbols.py warns on invalid references to
Kconfig symbols (often, minor typos, name confusions or outdated references).
This patch series addresses all issues reported by
./scripts/checkkconfigsymbols.py in ./drivers/usb/ for Kconfig and Makefile
files. Issues in the Kconfig and Makefile files indicate some shortcomings in
the overall build definitions, and often are true actionable issues to address.
These issues can be identified and filtered by:
./scripts/checkkconfigsymbols.py | grep -E "arch/powerpc/.*(Kconfig|Makefile)" -B 1 -A 1
After applying this patch series on linux-next (next-20210817), the command
above yields just two false positives (SHELL, r13) due to tool shortcomings.
As these two patches are fixes, please consider if they are suitable for
backporting to stable.
Lukas
Lukas Bulwahn (2):
powerpc: kvm: rectify selection to PPC_DAWR
powerpc: rectify selection to ARCH_ENABLE_SPLIT_PMD_PTLOCK
arch/powerpc/kvm/Kconfig | 2 +-
arch/powerpc/platforms/Kconfig.cputype | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
--
2.26.2
Hello,
We ran automated tests on a recent commit from this kernel tree:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Commit: f428e49b8cb1 - Linux 5.13.12
The results of these automated tests are provided below.
Overall result: PASSED
Merge: OK
Compile: OK
Tests: OK
All kernel binaries, config files, and logs are available for download here:
https://arr-cki-prod-datawarehouse-public.s3.amazonaws.com/index.html?prefi…
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
,-. ,-.
( C ) ( K ) Continuous
`-',-.`-' Kernel
( I ) Integration
`-'
______________________________________________________________________________
Compile testing
---------------
We compiled the kernel for 4 architectures:
aarch64:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
s390x:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
Hardware testing
----------------
We booted each kernel and ran the following tests:
aarch64:
Host 1:
✅ Boot test
✅ Reboot test
✅ xfstests - ext4
✅ xfstests - xfs
✅ selinux-policy: serge-testsuite
✅ Storage blktests
✅ storage: software RAID testing
✅ Storage: swraid mdadm raid_module test
🚧 ✅ Podman system integration test - as root
🚧 ❌ Podman system integration test - as user
🚧 ✅ xfstests - btrfs
🚧 ✅ IPMI driver test
🚧 ✅ IPMItool loop stress test
🚧 ✅ Storage block - filesystem fio test
🚧 ✅ Storage block - queue scheduler test
🚧 ✅ Storage nvme - tcp
🚧 💥 stress: stress-ng
Host 2:
✅ Boot test
✅ Reboot test
✅ ACPI table test
✅ ACPI enabled test
✅ LTP
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking: igmp conformance test
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ storage: SCSI VPD
✅ trace: ftrace/tracer
🚧 ✅ xarray-idr-radixtree-test
🚧 ✅ i2c: i2cdetect sanity
🚧 ✅ NFS Connectathon
🚧 ✅ Firmware test suite
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
ppc64le:
Host 1:
✅ Boot test
✅ Reboot test
✅ xfstests - ext4
✅ xfstests - xfs
✅ selinux-policy: serge-testsuite
✅ Storage blktests
✅ storage: software RAID testing
✅ Storage: swraid mdadm raid_module test
🚧 ✅ Podman system integration test - as root
🚧 ✅ Podman system integration test - as user
🚧 ✅ xfstests - btrfs
🚧 ✅ IPMI driver test
🚧 ✅ IPMItool loop stress test
🚧 ✅ Storage block - filesystem fio test
🚧 ✅ Storage block - queue scheduler test
🚧 ✅ Storage nvme - tcp
🚧 ✅ Storage: lvm device-mapper test - upstream
Host 2:
✅ Boot test
✅ Reboot test
✅ LTP
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ trace: ftrace/tracer
🚧 ✅ xarray-idr-radixtree-test
🚧 ✅ NFS Connectathon
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
s390x:
Host 1:
✅ Boot test
✅ Reboot test
✅ selinux-policy: serge-testsuite
✅ Storage blktests
✅ Storage: swraid mdadm raid_module test
🚧 ✅ Podman system integration test - as root
🚧 ✅ Podman system integration test - as user
🚧 ✅ Storage nvme - tcp
🚧 ❌ stress: stress-ng
Host 2:
✅ Boot test
✅ Reboot test
✅ LTP
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ trace: ftrace/tracer
🚧 ✅ xarray-idr-radixtree-test
🚧 ✅ NFS Connectathon
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
x86_64:
Host 1:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ xfstests - ext4
⚡⚡⚡ xfstests - xfs
⚡⚡⚡ xfstests - nfsv4.2
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ power-management: cpupower/sanity test
⚡⚡⚡ Storage blktests
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ CPU: Idle Test
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ xfstests - cifsv3.11
🚧 ⚡⚡⚡ IPMI driver test
🚧 ⚡⚡⚡ IPMItool loop stress test
🚧 ⚡⚡⚡ Storage block - filesystem fio test
🚧 ⚡⚡⚡ Storage block - queue scheduler test
🚧 ⚡⚡⚡ Storage nvme - tcp
🚧 ⚡⚡⚡ Storage: lvm device-mapper test - upstream
🚧 ⚡⚡⚡ stress: stress-ng
Host 2:
✅ Boot test
✅ Reboot test
✅ ACPI table test
✅ LTP
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking: igmp conformance test
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: sanity smoke test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ storage: SCSI VPD
✅ trace: ftrace/tracer
🚧 ✅ xarray-idr-radixtree-test
🚧 ✅ i2c: i2cdetect sanity
🚧 ✅ NFS Connectathon
🚧 ✅ Firmware test suite
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
Host 3:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ xfstests - ext4
⚡⚡⚡ xfstests - xfs
⚡⚡⚡ xfstests - nfsv4.2
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ power-management: cpupower/sanity test
⚡⚡⚡ Storage blktests
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ CPU: Idle Test
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ xfstests - cifsv3.11
🚧 ⚡⚡⚡ IPMI driver test
🚧 ⚡⚡⚡ IPMItool loop stress test
🚧 ⚡⚡⚡ Storage block - filesystem fio test
🚧 ⚡⚡⚡ Storage block - queue scheduler test
🚧 ⚡⚡⚡ Storage nvme - tcp
🚧 ⚡⚡⚡ Storage: lvm device-mapper test - upstream
🚧 ⚡⚡⚡ stress: stress-ng
Host 4:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ xfstests - ext4
⚡⚡⚡ xfstests - xfs
⚡⚡⚡ xfstests - nfsv4.2
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ power-management: cpupower/sanity test
⚡⚡⚡ Storage blktests
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ CPU: Idle Test
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ xfstests - cifsv3.11
🚧 ⚡⚡⚡ IPMI driver test
🚧 ⚡⚡⚡ IPMItool loop stress test
🚧 ⚡⚡⚡ Storage block - filesystem fio test
🚧 ⚡⚡⚡ Storage block - queue scheduler test
🚧 ⚡⚡⚡ Storage nvme - tcp
🚧 ⚡⚡⚡ Storage: lvm device-mapper test - upstream
🚧 ⚡⚡⚡ stress: stress-ng
Test sources: https://gitlab.com/cki-project/kernel-tests
💚 Pull requests are welcome for new tests or improvements to existing tests!
Aborted tests
-------------
Tests that didn't complete running successfully are marked with ⚡⚡⚡.
If this was caused by an infrastructure issue, we try to mark that
explicitly in the report.
Waived tests
------------
If the test run included waived tests, they are marked with 🚧. Such tests are
executed but their results are not taken into account. Tests are waived when
their results are not reliable enough, e.g. when they're just introduced or are
being fixed.
Testing timeout
---------------
We aim to provide a report within reasonable timeframe. Tests that haven't
finished running yet are marked with ⏱.
The patch titled
Subject: hugetlb: don't pass page cache pages to restore_reserve_on_error
has been added to the -mm tree. Its filename is
hugetlb-dont-pass-page-cache-pages-to-restore_reserve_on_error.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/hugetlb-dont-pass-page-cache-page…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/hugetlb-dont-pass-page-cache-page…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Subject: hugetlb: don't pass page cache pages to restore_reserve_on_error
syzbot hit kernel BUG at fs/hugetlbfs/inode.c:532 as described in [1].
This BUG triggers if the HPageRestoreReserve flag is set on a page in
the page cache. It should never be set, as the routine
huge_add_to_page_cache explicitly clears the flag after adding a page
to the cache.
The only code other than huge page allocation which sets the flag is
restore_reserve_on_error. It will potentially set the flag in rare out
of memory conditions. syzbot was injecting errors to cause memory
allocation errors which exercised this specific path.
The code in restore_reserve_on_error is doing the right thing. However,
there are instances where pages in the page cache were being passed to
restore_reserve_on_error. This is incorrect, as once a page goes into
the cache reservation information will not be modified for the page until
it is removed from the cache. Error paths do not remove pages from the
cache, so even in the case of error, the page will remain in the cache
and no reservation adjustment is needed.
Modify routines that potentially call restore_reserve_on_error with a
page cache page to no longer do so.
Note on fixes tag:
Prior to commit 846be08578ed ("mm/hugetlb: expand restore_reserve_on_error
functionality") the routine would not process page cache pages because
the HPageRestoreReserve flag is not set on such pages. Therefore, this
issue could not be trigggered. The code added by commit 846be08578ed
("mm/hugetlb: expand restore_reserve_on_error functionality") is needed
and correct. It exposed incorrect calls to restore_reserve_on_error which
is the root cause addressed by this commit.
[1] https://lore.kernel.org/linux-mm/00000000000050776d05c9b7c7f0@google.com/
Link: https://lkml.kernel.org/r/20210818213304.37038-1-mike.kravetz@oracle.com
Fixes: 846be08578ed ("mm/hugetlb: expand restore_reserve_on_error functionality")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reported-by: <syzbot+67654e51e54455f1c585(a)syzkaller.appspotmail.com>
Cc: Mina Almasry <almasrymina(a)google.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Naoya Horiguchi <naoya.horiguchi(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
--- a/mm/hugetlb.c~hugetlb-dont-pass-page-cache-pages-to-restore_reserve_on_error
+++ a/mm/hugetlb.c
@@ -2476,7 +2476,7 @@ void restore_reserve_on_error(struct hst
if (!rc) {
/*
* This indicates there is an entry in the reserve map
- * added by alloc_huge_page. We know it was added
+ * not added by alloc_huge_page. We know it was added
* before the alloc_huge_page call, otherwise
* HPageRestoreReserve would be set on the page.
* Remove the entry so that a subsequent allocation
@@ -4660,7 +4660,9 @@ retry_avoidcopy:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
- restore_reserve_on_error(h, vma, haddr, new_page);
+ /* No restore in case of successful pagetable update (Break COW) */
+ if (new_page != old_page)
+ restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
@@ -4776,7 +4778,7 @@ static vm_fault_t hugetlb_no_page(struct
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
- bool new_page = false;
+ bool new_page, new_pagecache_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -4799,6 +4801,7 @@ static vm_fault_t hugetlb_no_page(struct
goto out;
retry:
+ new_page = false;
page = find_lock_page(mapping, idx);
if (!page) {
/* Check for page in userfault range */
@@ -4842,6 +4845,7 @@ retry:
goto retry;
goto out;
}
+ new_pagecache_page = true;
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
@@ -4926,7 +4930,9 @@ backout:
spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
- restore_reserve_on_error(h, vma, haddr, page);
+ /* restore reserve for newly allocated pages not in page cache */
+ if (new_page && !new_pagecache_page)
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
goto out;
}
@@ -5135,6 +5141,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_s
int ret = -ENOMEM;
struct page *page;
int writable;
+ bool new_pagecache_page = false;
if (is_continue) {
ret = -EFAULT;
@@ -5228,6 +5235,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_s
ret = huge_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
+ new_pagecache_page = true;
}
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
@@ -5291,7 +5299,8 @@ out_release_unlock:
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
- restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ if (!new_pagecache_page)
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
_
Patches currently in -mm which might be from mike.kravetz(a)oracle.com are
hugetlb-dont-pass-page-cache-pages-to-restore_reserve_on_error.patch
hugetlb-simplify-prep_compound_gigantic_page-ref-count-racing-code.patch
hugetlb-drop-ref-count-earlier-after-page-allocation.patch
hugetlb-before-freeing-hugetlb-page-set-dtor-to-appropriate-value.patch
syzbot hit kernel BUG at fs/hugetlbfs/inode.c:532 as described in [1].
This BUG triggers if the HPageRestoreReserve flag is set on a page in
the page cache. It should never be set, as the routine
huge_add_to_page_cache explicitly clears the flag after adding a page
to the cache.
The only code other than huge page allocation which sets the flag is
restore_reserve_on_error. It will potentially set the flag in rare out
of memory conditions. syzbot was injecting errors to cause memory
allocation errors which exercised this specific path.
The code in restore_reserve_on_error is doing the right thing. However,
there are instances where pages in the page cache were being passed to
restore_reserve_on_error. This is incorrect, as once a page goes into
the cache reservation information will not be modified for the page until
it is removed from the cache. Error paths do not remove pages from the
cache, so even in the case of error, the page will remain in the cache
and no reservation adjustment is needed.
Modify routines that potentially call restore_reserve_on_error with a
page cache page to no longer do so.
Note on fixes tag:
Prior to commit 846be08578ed ("mm/hugetlb: expand restore_reserve_on_error
functionality") the routine would not process page cache pages because
the HPageRestoreReserve flag is not set on such pages. Therefore, this
issue could not be trigggered. The code added by commit 846be08578ed
("mm/hugetlb: expand restore_reserve_on_error functionality") is needed
and correct. It exposed incorrect calls to restore_reserve_on_error which
is the root cause addressed by this commit.
[1] https://lore.kernel.org/linux-mm/00000000000050776d05c9b7c7f0@google.com/
Fixes: 846be08578ed ("mm/hugetlb: expand restore_reserve_on_error functionality")
Reported-by: syzbot+67654e51e54455f1c585(a)syzkaller.appspotmail.com
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
---
mm/hugetlb.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6337697f7ee4..54c715e7e362 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2546,7 +2546,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
if (!rc) {
/*
* This indicates there is an entry in the reserve map
- * added by alloc_huge_page. We know it was added
+ * not added by alloc_huge_page. We know it was added
* before the alloc_huge_page call, otherwise
* HPageRestoreReserve would be set on the page.
* Remove the entry so that a subsequent allocation
@@ -4753,7 +4753,9 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
- restore_reserve_on_error(h, vma, haddr, new_page);
+ /* No restore in case of successful pagetable update (Break COW) */
+ if (new_page != old_page)
+ restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
@@ -4869,7 +4871,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
- bool new_page = false;
+ bool new_page, new_pagecache_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -4892,6 +4894,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
goto out;
retry:
+ new_page = false;
page = find_lock_page(mapping, idx);
if (!page) {
/* Check for page in userfault range */
@@ -4935,6 +4938,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
goto retry;
goto out;
}
+ new_pagecache_page = true;
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
@@ -5019,7 +5023,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
- restore_reserve_on_error(h, vma, haddr, page);
+ /* restore reserve for newly allocated pages not in page cache */
+ if (new_page && !new_pagecache_page)
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
goto out;
}
@@ -5228,6 +5234,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
int ret = -ENOMEM;
struct page *page;
int writable;
+ bool new_pagecache_page = false;
if (is_continue) {
ret = -EFAULT;
@@ -5321,6 +5328,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ret = huge_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
+ new_pagecache_page = true;
}
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
@@ -5384,7 +5392,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
- restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ if (!new_pagecache_page)
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
--
2.31.1
When a mapped level interrupt (a timer, for example) is deactivated
by the guest, the corresponding host interrupt is equally deactivated.
However, the fate of the pending state still needs to be dealt
with in SW.
This is specially true when the interrupt was in the active+pending
state in the virtual distributor at the point where the guest
was entered. On exit, the pending state is potentially stale
(the guest may have put the interrupt in a non-pending state).
If we don't do anything, the interrupt will be spuriously injected
in the guest. Although this shouldn't have any ill effect (spurious
interrupts are always possible), we can improve the emulation by
detecting the deactivation-while-pending case and resample the
interrupt.
Fixes: e40cc57bac79 ("KVM: arm/arm64: vgic: Support level-triggered mapped interrupts")
Reported-by: Raghavendra Rao Ananta <rananta(a)google.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
arch/arm64/kvm/vgic/vgic-v2.c | 25 ++++++++++++++++++-------
arch/arm64/kvm/vgic/vgic-v3.c | 25 ++++++++++++++++++-------
2 files changed, 36 insertions(+), 14 deletions(-)
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 2c580204f1dc..3e52ea86a87f 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
u32 val = cpuif->vgic_lr[lr];
u32 cpuid, intid = val & GICH_LR_VIRTUALID;
struct vgic_irq *irq;
+ bool deactivated;
/* Extract the source vCPU id from the LR */
cpuid = val & GICH_LR_PHYSID_CPUID;
@@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
if (irq->active && vgic_irq_is_sgi(intid))
@@ -105,6 +107,12 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
* device state could have changed or we simply need to
* process the still pending interrupt later.
*
+ * We could also have entered the guest with the interrupt
+ * active+pending. On the next exit, we need to re-evaluate
+ * the pending state, as it could otherwise result in a
+ * spurious interrupt by injecting a now potentially stale
+ * pending state.
+ *
* If this causes us to lower the level, we have to also clear
* the physical active state, since we will otherwise never be
* told when the interrupt becomes asserted again.
@@ -115,12 +123,15 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
if (vgic_irq_is_mapped_level(irq)) {
bool resample = false;
- if (val & GICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
+ if (unlikely(vgic_irq_needs_resampling(irq))) {
+ if (!(irq->active || irq->pending_latch))
+ resample = true;
+ } else {
+ if ((val & GICH_LR_PENDING_BIT) ||
+ (deactivated && irq->line_level)) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+ resample = !irq->line_level;
+ }
}
if (resample)
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 66004f61cd83..74f9aefffd5e 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
u32 intid, cpuid;
struct vgic_irq *irq;
bool is_v2_sgi = false;
+ bool deactivated;
cpuid = val & GICH_LR_PHYSID_CPUID;
cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
@@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
irq->active = !!(val & ICH_LR_ACTIVE_BIT);
if (irq->active && is_v2_sgi)
@@ -98,6 +100,12 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
* device state could have changed or we simply need to
* process the still pending interrupt later.
*
+ * We could also have entered the guest with the interrupt
+ * active+pending. On the next exit, we need to re-evaluate
+ * the pending state, as it could otherwise result in a
+ * spurious interrupt by injecting a now potentially stale
+ * pending state.
+ *
* If this causes us to lower the level, we have to also clear
* the physical active state, since we will otherwise never be
* told when the interrupt becomes asserted again.
@@ -108,12 +116,15 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
if (vgic_irq_is_mapped_level(irq)) {
bool resample = false;
- if (val & ICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
+ if (unlikely(vgic_irq_needs_resampling(irq))) {
+ if (!(irq->active || irq->pending_latch))
+ resample = true;
+ } else {
+ if ((val & ICH_LR_PENDING_BIT) ||
+ (deactivated && irq->line_level)) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+ resample = !irq->line_level;
+ }
}
if (resample)
--
2.30.2
The patch titled
Subject: kfence: fix is_kfence_address() for addresses below KFENCE_POOL_SIZE
has been added to the -mm tree. Its filename is
kfence-fix-is_kfence_address-for-addresses-below-kfence_pool_size.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/kfence-fix-is_kfence_address-for-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/kfence-fix-is_kfence_address-for-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Marco Elver <elver(a)google.com>
Subject: kfence: fix is_kfence_address() for addresses below KFENCE_POOL_SIZE
Originally the addr != NULL check was meant to take care of the case where
__kfence_pool == NULL (KFENCE is disabled). However, this does not work
for addresses where addr > 0 && addr < KFENCE_POOL_SIZE.
This can be the case on NULL-deref where addr > 0 && addr < PAGE_SIZE or
any other faulting access with addr < KFENCE_POOL_SIZE. While the kernel
would likely crash, the stack traces and report might be confusing due to
double faults upon KFENCE's attempt to unprotect such an address.
Fix it by just checking that __kfence_pool != NULL instead.
Link: https://lkml.kernel.org/r/20210818130300.2482437-1-elver@google.com
Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure")
Signed-off-by: Marco Elver <elver(a)google.com>
Reported-by: Kuan-Ying Lee <Kuan-Ying.Lee(a)mediatek.com>
Acked-by: Alexander Potapenko <glider(a)google.com>
Cc: Dmitry Vyukov <dvyukov(a)google.com>
Cc: <stable(a)vger.kernel.org> [5.12+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/kfence.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/include/linux/kfence.h~kfence-fix-is_kfence_address-for-addresses-below-kfence_pool_size
+++ a/include/linux/kfence.h
@@ -51,10 +51,11 @@ extern atomic_t kfence_allocation_gate;
static __always_inline bool is_kfence_address(const void *addr)
{
/*
- * The non-NULL check is required in case the __kfence_pool pointer was
- * never initialized; keep it in the slow-path after the range-check.
+ * The __kfence_pool != NULL check is required to deal with the case
+ * where __kfence_pool == NULL && addr < KFENCE_POOL_SIZE. Keep it in
+ * the slow-path after the range-check!
*/
- return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && addr);
+ return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && __kfence_pool);
}
/**
_
Patches currently in -mm which might be from elver(a)google.com are
kfence-fix-is_kfence_address-for-addresses-below-kfence_pool_size.patch
kfence-show-cpu-and-timestamp-in-alloc-free-info.patch
The patch titled
Subject: mm/filemap.c: remove bogus VM_BUG_ON
has been added to the -mm tree. Its filename is
mm-remove-bogus-vm_bug_on.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-remove-bogus-vm_bug_on.patch
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-remove-bogus-vm_bug_on.patch
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Subject: mm/filemap.c: remove bogus VM_BUG_ON
It is not safe to check page->index without holding the page lock. It can
be changed if the page is moved between the swap cache and the page cache
for a shmem file, for example. There is a VM_BUG_ON below which checks
page->index is correct after taking the page lock.
Link: https://lkml.kernel.org/r/20210818144932.940640-1-willy@infradead.org
Fixes: 5c211ba29deb ("mm: add and use find_lock_entries")
Signed-off-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Reported-by: <syzbot+c87be4f669d920c76330(a)syzkaller.appspotmail.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/filemap.c | 1 -
1 file changed, 1 deletion(-)
--- a/mm/filemap.c~mm-remove-bogus-vm_bug_on
+++ a/mm/filemap.c
@@ -2038,7 +2038,6 @@ unsigned find_lock_entries(struct addres
if (!xa_is_value(page)) {
if (page->index < start)
goto put;
- VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
if (page->index + thp_nr_pages(page) - 1 > end)
goto put;
if (!trylock_page(page))
_
Patches currently in -mm which might be from willy(a)infradead.org are
mm-remove-bogus-vm_bug_on.patch
mm-report-a-more-useful-address-for-reclaim-acquisition.patch
mm-mark-idle-page-tracking-as-broken.patch
avoid-a-warning-in-sparse-memory-support.patch
mm-move-kvmalloc-related-functions-to-slabh.patch
This is the start of the stable review cycle for the 5.4.142 release.
There are 62 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed, 18 Aug 2021 12:54:12 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.4.142-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.4.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.4.142-rc1
Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
iommu/vt-d: Fix agaw for a supported 48 bit guest address width
Nathan Chancellor <nathan(a)kernel.org>
vmlinux.lds.h: Handle clang's module.{c,d}tor sections
Jeff Layton <jlayton(a)kernel.org>
ceph: take snap_empty_lock atomically with snaprealm refcount change
Jeff Layton <jlayton(a)kernel.org>
ceph: clean up locking annotation for ceph_get_snap_realm and __lookup_snap_realm
Jeff Layton <jlayton(a)kernel.org>
ceph: add some lockdep assertions around snaprealm handling
Sean Christopherson <seanjc(a)google.com>
KVM: VMX: Use current VMCS to query WAITPKG support for MSR emulation
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Protect msi_desc::masked for multi-MSI
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Use msi_mask_irq() in pci_msi_shutdown()
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Correct misleading comments
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Do not set invalid bits in MSI mask
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Enforce MSI[X] entry updates to be visible
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Enforce that MSI-X table entry is masked for update
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Mask all unused MSI-X entries
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Enable and mask MSI-X early
Ben Dai <ben.dai(a)unisoc.com>
genirq/timings: Prevent potential array overflow in __irq_timings_store()
Bixuan Cui <cuibixuan(a)huawei.com>
genirq/msi: Ensure deactivation on teardown
Babu Moger <Babu.Moger(a)amd.com>
x86/resctrl: Fix default monitoring groups reporting
Thomas Gleixner <tglx(a)linutronix.de>
x86/ioapic: Force affinity setup before startup
Thomas Gleixner <tglx(a)linutronix.de>
x86/msi: Force affinity setup before startup
Thomas Gleixner <tglx(a)linutronix.de>
genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP
Randy Dunlap <rdunlap(a)infradead.org>
x86/tools: Fix objdump version check again
Pu Lehui <pulehui(a)huawei.com>
powerpc/kprobes: Fix kprobe Oops happens in booke
Xie Yongji <xieyongji(a)bytedance.com>
nbd: Aovid double completion of a request
Longpeng(Mike) <longpeng2(a)huawei.com>
vsock/virtio: avoid potential deadlock when vsock device remove
Maximilian Heyne <mheyne(a)amazon.de>
xen/events: Fix race in set_evtchn_to_irq
Eric Dumazet <edumazet(a)google.com>
net: igmp: increase size of mr_ifc_count
Neal Cardwell <ncardwell(a)google.com>
tcp_bbr: fix u32 wrap bug in round logic if bbr_init() called after 2B packets
Willy Tarreau <w(a)1wt.eu>
net: linkwatch: fix failure to restore device state across suspend/resume
Yang Yingliang <yangyingliang(a)huawei.com>
net: bridge: fix memleak in br_add_if()
Vladimir Oltean <vladimir.oltean(a)nxp.com>
net: dsa: sja1105: fix broken backpressure in .port_fdb_dump
Vladimir Oltean <vladimir.oltean(a)nxp.com>
net: dsa: lantiq: fix broken backpressure in .port_fdb_dump
Vladimir Oltean <vladimir.oltean(a)nxp.com>
net: dsa: lan9303: fix broken backpressure in .port_fdb_dump
Eric Dumazet <edumazet(a)google.com>
net: igmp: fix data-race in igmp_ifc_timer_expire()
Takeshi Misawa <jeliantsurux(a)gmail.com>
net: Fix memory leak in ieee802154_raw_deliver
Ben Hutchings <ben.hutchings(a)mind.be>
net: dsa: microchip: Fix ksz_read64()
Christian Hewitt <christianshewitt(a)gmail.com>
drm/meson: fix colour distortion from HDR set during vendor u-boot
Aya Levin <ayal(a)nvidia.com>
net/mlx5: Fix return value from tracer initialization
Roi Dayan <roid(a)nvidia.com>
psample: Add a fwd declaration for skbuff
Md Fahad Iqbal Polash <md.fahad.iqbal.polash(a)intel.com>
iavf: Set RSS LUT and key in reset handle path
Hangbin Liu <liuhangbin(a)gmail.com>
net: sched: act_mirred: Reset ct info when mirror/redirect skb
Pali Rohár <pali(a)kernel.org>
ppp: Fix generating ifname when empty IFLA_IFNAME is specified
Ben Hutchings <ben.hutchings(a)mind.be>
net: phy: micrel: Fix link detection on ksz87xx switch"
Hans de Goede <hdegoede(a)redhat.com>
platform/x86: pcengines-apuv2: Add missing terminating entries to gpio-lookup tables
Florian Eckert <fe(a)dev.tdt.de>
platform/x86: pcengines-apuv2: revert wiring up simswitch GPIO as LED
DENG Qingfang <dqfext(a)gmail.com>
net: dsa: mt7530: add the missing RxUnicast MIB counter
Richard Fitzgerald <rf(a)opensource.cirrus.com>
ASoC: cs42l42: Fix LRCLK frame start edge
Yajun Deng <yajun.deng(a)linux.dev>
netfilter: nf_conntrack_bridge: Fix memory leak when error
Richard Fitzgerald <rf(a)opensource.cirrus.com>
ASoC: cs42l42: Remove duplicate control for WNF filter frequency
Richard Fitzgerald <rf(a)opensource.cirrus.com>
ASoC: cs42l42: Fix inversion of ADC Notch Switch control
Richard Fitzgerald <rf(a)opensource.cirrus.com>
ASoC: cs42l42: Don't allow SND_SOC_DAIFMT_LEFT_J
Richard Fitzgerald <rf(a)opensource.cirrus.com>
ASoC: cs42l42: Correct definition of ADC Volume control
Dongliang Mu <mudongliangabcd(a)gmail.com>
ieee802154: hwsim: fix GPF in hwsim_new_edge_nl
Dongliang Mu <mudongliangabcd(a)gmail.com>
ieee802154: hwsim: fix GPF in hwsim_set_edge_lqi
Dan Williams <dan.j.williams(a)intel.com>
libnvdimm/region: Fix label activation vs errors
Dan Williams <dan.j.williams(a)intel.com>
ACPI: NFIT: Fix support for virtual SPA ranges
Luis Henriques <lhenriques(a)suse.de>
ceph: reduce contention in ceph_check_delayed_caps()
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
i2c: dev: zero out array used for i2c reads from userspace
Takashi Iwai <tiwai(a)suse.de>
ASoC: intel: atom: Fix reference to PCM buffer address
Takashi Iwai <tiwai(a)suse.de>
ASoC: xilinx: Fix reference to PCM buffer address
Colin Ian King <colin.king(a)canonical.com>
iio: adc: Fix incorrect exit of for-loop
Chris Lesiak <chris.lesiak(a)licor.com>
iio: humidity: hdc100x: Add margin to the conversion time
Uwe Kleine-König <u.kleine-koenig(a)pengutronix.de>
iio: adc: ti-ads7950: Ensure CS is deasserted after reading channels
-------------
Diffstat:
Makefile | 4 +-
arch/powerpc/kernel/kprobes.c | 3 +-
arch/x86/kernel/apic/io_apic.c | 6 +-
arch/x86/kernel/apic/msi.c | 13 ++-
arch/x86/kernel/cpu/resctrl/monitor.c | 27 +++--
arch/x86/kvm/vmx/vmx.h | 2 +-
arch/x86/tools/chkobjdump.awk | 1 +
drivers/acpi/nfit/core.c | 3 +
drivers/base/core.c | 1 +
drivers/block/nbd.c | 14 ++-
drivers/gpu/drm/meson/meson_registers.h | 5 +
drivers/gpu/drm/meson/meson_viu.c | 7 +-
drivers/i2c/i2c-dev.c | 5 +-
drivers/iio/adc/palmas_gpadc.c | 4 +-
drivers/iio/adc/ti-ads7950.c | 1 -
drivers/iio/humidity/hdc100x.c | 6 +-
drivers/iommu/intel-iommu.c | 7 +-
drivers/net/dsa/lan9303-core.c | 34 +++---
drivers/net/dsa/lantiq_gswip.c | 14 ++-
drivers/net/dsa/microchip/ksz_common.h | 8 +-
drivers/net/dsa/mt7530.c | 1 +
drivers/net/dsa/sja1105/sja1105_main.c | 4 +-
drivers/net/ethernet/intel/iavf/iavf_main.c | 13 ++-
.../ethernet/mellanox/mlx5/core/diag/fw_tracer.c | 11 +-
drivers/net/ieee802154/mac802154_hwsim.c | 6 +-
drivers/net/phy/micrel.c | 2 -
drivers/net/ppp/ppp_generic.c | 2 +-
drivers/nvdimm/namespace_devs.c | 17 ++-
drivers/pci/msi.c | 125 +++++++++++++--------
drivers/platform/x86/pcengines-apuv2.c | 5 +-
drivers/xen/events/events_base.c | 20 +++-
fs/ceph/caps.c | 17 ++-
fs/ceph/mds_client.c | 25 +++--
fs/ceph/snap.c | 54 +++++----
fs/ceph/super.h | 2 +-
include/asm-generic/vmlinux.lds.h | 1 +
include/linux/device.h | 1 +
include/linux/inetdevice.h | 2 +-
include/linux/irq.h | 2 +
include/linux/msi.h | 2 +-
include/net/psample.h | 2 +
kernel/irq/chip.c | 5 +-
kernel/irq/msi.c | 13 ++-
kernel/irq/timings.c | 5 +
net/bridge/br_if.c | 2 +
net/bridge/netfilter/nf_conntrack_bridge.c | 6 +
net/core/link_watch.c | 5 +-
net/ieee802154/socket.c | 7 +-
net/ipv4/igmp.c | 21 ++--
net/ipv4/tcp_bbr.c | 2 +-
net/sched/act_mirred.c | 3 +
net/vmw_vsock/virtio_transport.c | 7 +-
sound/soc/codecs/cs42l42.c | 39 +++----
sound/soc/intel/atom/sst-mfld-platform-pcm.c | 3 +-
sound/soc/xilinx/xlnx_formatter_pcm.c | 4 +-
55 files changed, 382 insertions(+), 219 deletions(-)
It is not safe to check page->index without holding the page lock.
It can be changed if the page is moved between the swap cache and the
page cache for a shmem file, for example. There is a VM_BUG_ON below
which checks page->index is correct after taking the page lock.
Cc: stable(a)vger.kernel.org
Fixes: 5c211ba29deb ("mm: add and use find_lock_entries")
Signed-off-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
---
mm/filemap.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index d1458ecf2f51..34de0b14aaa9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2033,17 +2033,16 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
rcu_read_lock();
while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
if (!xa_is_value(page)) {
if (page->index < start)
goto put;
- VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
if (page->index + thp_nr_pages(page) - 1 > end)
goto put;
if (!trylock_page(page))
goto put;
if (page->mapping != mapping || PageWriteback(page))
goto unlock;
VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index),
page);
--
2.30.2
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 826da771291fc25a428e871f9e7fb465e390f852 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 29 Jul 2021 23:51:48 +0200
Subject: [PATCH] genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP
X86 IO/APIC and MSI interrupts (when used without interrupts remapping)
require that the affinity setup on startup is done before the interrupt is
enabled for the first time as the non-remapped operation mode cannot safely
migrate enabled interrupts from arbitrary contexts. Provide a new irq chip
flag which allows affected hardware to request this.
This has to be opt-in because there have been reports in the past that some
interrupt chips cannot handle affinity setting before startup.
Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.779791738@linutronix.de
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8e9a9ae471a6..c8293c817646 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -569,6 +569,7 @@ struct irq_chip {
* IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips
* IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs
* in the suspend path if they are in disabled state
+ * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup
*/
enum {
IRQCHIP_SET_TYPE_MASKED = (1 << 0),
@@ -581,6 +582,7 @@ enum {
IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7),
IRQCHIP_SUPPORTS_NMI = (1 << 8),
IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9),
+ IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10),
};
#include <linux/irqdesc.h>
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 7f04c7d8296e..a98bcfc4be7b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
} else {
switch (__irq_startup_managed(desc, aff, force)) {
case IRQ_STARTUP_NORMAL:
+ if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)
+ irq_setup_affinity(desc);
ret = __irq_startup(desc);
- irq_setup_affinity(desc);
+ if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP))
+ irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
irq_do_set_affinity(d, aff, false);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From dbbc93576e03fbe24b365fab0e901eb442237a8a Mon Sep 17 00:00:00 2001
From: Bixuan Cui <cuibixuan(a)huawei.com>
Date: Tue, 18 May 2021 11:31:17 +0800
Subject: [PATCH] genirq/msi: Ensure deactivation on teardown
msi_domain_alloc_irqs() invokes irq_domain_activate_irq(), but
msi_domain_free_irqs() does not enforce deactivation before tearing down
the interrupts.
This happens when PCI/MSI interrupts are set up and never used before being
torn down again, e.g. in error handling pathes. The only place which cleans
that up is the error handling path in msi_domain_alloc_irqs().
Move the cleanup from msi_domain_alloc_irqs() into msi_domain_free_irqs()
to cure that.
Fixes: f3b0946d629c ("genirq/msi: Make sure PCI MSIs are activated early")
Signed-off-by: Bixuan Cui <cuibixuan(a)huawei.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210518033117.78104-1-cuibixuan@huawei.com
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index c41965e348b5..85df3ca03efe 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -476,11 +476,6 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
return 0;
cleanup:
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- if (irqd_is_activated(irq_data))
- irq_domain_deactivate_irq(irq_data);
- }
msi_domain_free_irqs(domain, dev);
return ret;
}
@@ -505,7 +500,15 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
+ struct irq_data *irq_data;
struct msi_desc *desc;
+ int i;
+
+ for_each_msi_vector(desc, i, dev) {
+ irq_data = irq_domain_get_irq_data(domain, i);
+ if (irqd_is_activated(irq_data))
+ irq_domain_deactivate_irq(irq_data);
+ }
for_each_msi_entry(desc, dev) {
/*
This is a note to let you know that I've just added the patch titled
usb: typec: tcpm: Fix VDMs sometimes not being forwarded to alt-mode
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 5571ea3117ca22849072adb58074fb5a2fd12c00 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede(a)redhat.com>
Date: Mon, 16 Aug 2021 17:46:32 +0200
Subject: usb: typec: tcpm: Fix VDMs sometimes not being forwarded to alt-mode
drivers
Commit a20dcf53ea98 ("usb: typec: tcpm: Respond Not_Supported if no
snk_vdo"), stops tcpm_pd_data_request() calling tcpm_handle_vdm_request()
when port->nr_snk_vdo is not set. But the VDM might be intended for an
altmode-driver, in which case nr_snk_vdo does not matter.
This change breaks the forwarding of connector hotplug (HPD) events
for displayport altmode on devices which don't set nr_snk_vdo.
tcpm_pd_data_request() is the only caller of tcpm_handle_vdm_request(),
so we can move the nr_snk_vdo check to inside it, at which point we
have already looked up the altmode device so we can check for this too.
Doing this check here also ensures that vdm_state gets set to
VDM_STATE_DONE if it was VDM_STATE_BUSY, even if we end up with
responding with PD_MSG_CTRL_NOT_SUPP later.
Note that tcpm_handle_vdm_request() was already sending
PD_MSG_CTRL_NOT_SUPP in some circumstances, after moving the nr_snk_vdo
check the same error-path is now taken when that check fails. So that
we have only one error-path for this and not two. Replace the
tcpm_queue_message(PD_MSG_CTRL_NOT_SUPP) used by the existing error-path
with the more robust tcpm_pd_handle_msg() from the (now removed) second
error-path.
Fixes: a20dcf53ea98 ("usb: typec: tcpm: Respond Not_Supported if no snk_vdo")
Cc: stable <stable(a)vger.kernel.org>
Cc: Kyle Tso <kyletso(a)google.com>
Acked-by: Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
Acked-by: Kyle Tso <kyletso(a)google.com>
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Link: https://lore.kernel.org/r/20210816154632.381968-1-hdegoede@redhat.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/typec/tcpm/tcpm.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index b9bb63d749ec..f4079b5cb26d 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -1737,6 +1737,10 @@ static int tcpm_pd_svdm(struct tcpm_port *port, struct typec_altmode *adev,
return rlen;
}
+static void tcpm_pd_handle_msg(struct tcpm_port *port,
+ enum pd_msg_request message,
+ enum tcpm_ams ams);
+
static void tcpm_handle_vdm_request(struct tcpm_port *port,
const __le32 *payload, int cnt)
{
@@ -1764,11 +1768,11 @@ static void tcpm_handle_vdm_request(struct tcpm_port *port,
port->vdm_state = VDM_STATE_DONE;
}
- if (PD_VDO_SVDM(p[0])) {
+ if (PD_VDO_SVDM(p[0]) && (adev || tcpm_vdm_ams(port) || port->nr_snk_vdo)) {
rlen = tcpm_pd_svdm(port, adev, p, cnt, response, &adev_action);
} else {
if (port->negotiated_rev >= PD_REV30)
- tcpm_queue_message(port, PD_MSG_CTRL_NOT_SUPP);
+ tcpm_pd_handle_msg(port, PD_MSG_CTRL_NOT_SUPP, NONE_AMS);
}
/*
@@ -2471,10 +2475,7 @@ static void tcpm_pd_data_request(struct tcpm_port *port,
NONE_AMS);
break;
case PD_DATA_VENDOR_DEF:
- if (tcpm_vdm_ams(port) || port->nr_snk_vdo)
- tcpm_handle_vdm_request(port, msg->payload, cnt);
- else if (port->negotiated_rev > PD_REV20)
- tcpm_pd_handle_msg(port, PD_MSG_CTRL_NOT_SUPP, NONE_AMS);
+ tcpm_handle_vdm_request(port, msg->payload, cnt);
break;
case PD_DATA_BIST:
port->bist_request = le32_to_cpu(msg->payload[0]);
--
2.32.0
Originally the addr != NULL check was meant to take care of the case
where __kfence_pool == NULL (KFENCE is disabled). However, this does not
work for addresses where addr > 0 && addr < KFENCE_POOL_SIZE.
This can be the case on NULL-deref where addr > 0 && addr < PAGE_SIZE or
any other faulting access with addr < KFENCE_POOL_SIZE. While the kernel
would likely crash, the stack traces and report might be confusing due
to double faults upon KFENCE's attempt to unprotect such an address.
Fix it by just checking that __kfence_pool != NULL instead.
Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure")
Reported-by: Kuan-Ying Lee <Kuan-Ying.Lee(a)mediatek.com>
Signed-off-by: Marco Elver <elver(a)google.com>
Cc: <stable(a)vger.kernel.org> [5.12+]
---
include/linux/kfence.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index a70d1ea03532..3fe6dd8a18c1 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -51,10 +51,11 @@ extern atomic_t kfence_allocation_gate;
static __always_inline bool is_kfence_address(const void *addr)
{
/*
- * The non-NULL check is required in case the __kfence_pool pointer was
- * never initialized; keep it in the slow-path after the range-check.
+ * The __kfence_pool != NULL check is required to deal with the case
+ * where __kfence_pool == NULL && addr < KFENCE_POOL_SIZE. Keep it in
+ * the slow-path after the range-check!
*/
- return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && addr);
+ return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && __kfence_pool);
}
/**
--
2.33.0.rc1.237.g0d66db33f3-goog
v1: https://lkml.org/lkml/2021/7/26/1509
Changelog v1-->v2:
Based on comments from Gautham,
1. Included a #define for MAX_NR_CHIPS instead of hardcoding the
allocation.
Pratik R. Sampat (1):
cpufreq:powernv: Fix init_chip_info initialization in numa=off
drivers/cpufreq/powernv-cpufreq.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
--
2.31.1
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f9dfb5e390fab2df9f7944bb91e7705aba14cd26 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Fri, 18 Jun 2021 16:18:25 +0200
Subject: [PATCH] x86/fpu: Make init_fpstate correct with optimized XSAVE
The XSAVE init code initializes all enabled and supported components with
XRSTOR(S) to init state. Then it XSAVEs the state of the components back
into init_fpstate which is used in several places to fill in the init state
of components.
This works correctly with XSAVE, but not with XSAVEOPT and XSAVES because
those use the init optimization and skip writing state of components which
are in init state. So init_fpstate.xsave still contains all zeroes after
this operation.
There are two ways to solve that:
1) Use XSAVE unconditionally, but that requires to reshuffle the buffer when
XSAVES is enabled because XSAVES uses compacted format.
2) Save the components which are known to have a non-zero init state by other
means.
Looking deeper, #2 is the right thing to do because all components the
kernel supports have all-zeroes init state except the legacy features (FP,
SSE). Those cannot be hard coded because the states are not identical on all
CPUs, but they can be saved with FXSAVE which avoids all conditionals.
Use FXSAVE to save the legacy FP/SSE components in init_fpstate along with
a BUILD_BUG_ON() which reminds developers to validate that a newly added
component has all zeroes init state. As a bonus remove the now unused
copy_xregs_to_kernel_booting() crutch.
The XSAVE and reshuffle method can still be implemented in the unlikely
case that components are added which have a non-zero init state and no
other means to save them. For now, FXSAVE is just simple and good enough.
[ bp: Fix a typo or two in the text. ]
Fixes: 6bad06b76892 ("x86, xsave: Use xsaveopt in context-switch path when supported")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Reviewed-by: Borislav Petkov <bp(a)suse.de>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20210618143444.587311343@linutronix.de
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index fdee23ea4e17..16bf4d4a8159 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -204,6 +204,14 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
}
+static inline void fxsave(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
+ else
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
+}
+
/* These macros all use (%edi)/(%rdi) as the single memory argument. */
#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
@@ -268,28 +276,6 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
-{
- u64 mask = xfeatures_mask_all;
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
- else
- XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
-
- /* We should never fault when copying to a kernel buffer: */
- WARN_ON_FPU(err);
-}
-
/*
* This function is called only during boot time when x86 caps are not set
* up and alternative can not be used yet.
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d0eef963aad1..1cadb2faf740 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -440,6 +440,25 @@ static void __init print_xstate_offset_size(void)
}
}
+/*
+ * All supported features have either init state all zeros or are
+ * handled in setup_init_fpu() individually. This is an explicit
+ * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
+ * newly added supported features at build time and make people
+ * actually look at the init state for the new feature.
+ */
+#define XFEATURES_INIT_FPSTATE_HANDLED \
+ (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_PASID)
+
/*
* setup the xstate image representing the init state
*/
@@ -447,6 +466,10 @@ static void __init setup_init_fpu_buf(void)
{
static int on_boot_cpu __initdata = 1;
+ BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
+ XFEATURES_INIT_FPSTATE_HANDLED);
+
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -466,10 +489,22 @@ static void __init setup_init_fpu_buf(void)
copy_kernel_to_xregs_booting(&init_fpstate.xsave);
/*
- * Dump the init state again. This is to identify the init state
- * of any feature which is not represented by all zero's.
+ * All components are now in init state. Read the state back so
+ * that init_fpstate contains all non-zero init state. This only
+ * works with XSAVE, but not with XSAVEOPT and XSAVES because
+ * those use the init optimization which skips writing data for
+ * components in init state.
+ *
+ * XSAVE could be used, but that would require to reshuffle the
+ * data when XSAVES is available because XSAVES uses xstate
+ * compaction. But doing so is a pointless exercise because most
+ * components have an all zeros init state except for the legacy
+ * ones (FP and SSE). Those can be saved with FXSAVE into the
+ * legacy area. Adding new features requires to ensure that init
+ * state is all zeroes or if not to add the necessary handling
+ * here.
*/
- copy_xregs_to_kernel_booting(&init_fpstate.xsave);
+ fxsave(&init_fpstate.fxsave);
}
static int xfeature_uncompacted_offset(int xfeature_nr)
This is a note to let you know that I've just added the patch titled
iio: ltc2983: fix device probe
to my staging git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
in the staging-next branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will also be merged in the next major kernel release
during the merge window.
If you have any questions about this process, please let me know.
>From b76d26d69ecc97ebb24aaf40427a13c808a4f488 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nuno=20S=C3=A1?= <nuno.sa(a)analog.com>
Date: Wed, 11 Aug 2021 15:32:20 +0200
Subject: iio: ltc2983: fix device probe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There is no reason to assume that the IRQ rising edge (indicating that
the device start up phase is done) will happen after we request the IRQ.
If the device is already up by the time we request it, the call to
'wait_for_completion_timeout()' will timeout and we will fail the device
probe even though there's nothing wrong.
Fix it by just polling the status register until we get the indication that
the device is up and running. As a side effect of this fix, requesting the
IRQ is also moved to after the setup function.
Fixes: f110f3188e563 ("iio: temperature: Add support for LTC2983")
Reported-and-tested-by: Drew Fustini <drew(a)pdp7.com>
Reviewed-by: Drew Fustini <drew(a)pdp7.com>
Signed-off-by: Nuno Sá <nuno.sa(a)analog.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko(a)gmail.com>
Cc: <Stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20210811133220.190264-2-nuno.sa@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/temperature/ltc2983.c | 30 ++++++++++++++----------------
1 file changed, 14 insertions(+), 16 deletions(-)
diff --git a/drivers/iio/temperature/ltc2983.c b/drivers/iio/temperature/ltc2983.c
index 3b5ba26d7d86..3b4a0e60e605 100644
--- a/drivers/iio/temperature/ltc2983.c
+++ b/drivers/iio/temperature/ltc2983.c
@@ -89,6 +89,8 @@
#define LTC2983_STATUS_START_MASK BIT(7)
#define LTC2983_STATUS_START(x) FIELD_PREP(LTC2983_STATUS_START_MASK, x)
+#define LTC2983_STATUS_UP_MASK GENMASK(7, 6)
+#define LTC2983_STATUS_UP(reg) FIELD_GET(LTC2983_STATUS_UP_MASK, reg)
#define LTC2983_STATUS_CHAN_SEL_MASK GENMASK(4, 0)
#define LTC2983_STATUS_CHAN_SEL(x) \
@@ -1362,17 +1364,16 @@ static int ltc2983_parse_dt(struct ltc2983_data *st)
static int ltc2983_setup(struct ltc2983_data *st, bool assign_iio)
{
- u32 iio_chan_t = 0, iio_chan_v = 0, chan, iio_idx = 0;
+ u32 iio_chan_t = 0, iio_chan_v = 0, chan, iio_idx = 0, status;
int ret;
- unsigned long time;
-
- /* make sure the device is up */
- time = wait_for_completion_timeout(&st->completion,
- msecs_to_jiffies(250));
- if (!time) {
+ /* make sure the device is up: start bit (7) is 0 and done bit (6) is 1 */
+ ret = regmap_read_poll_timeout(st->regmap, LTC2983_STATUS_REG, status,
+ LTC2983_STATUS_UP(status) == 1, 25000,
+ 25000 * 10);
+ if (ret) {
dev_err(&st->spi->dev, "Device startup timed out\n");
- return -ETIMEDOUT;
+ return ret;
}
st->iio_chan = devm_kzalloc(&st->spi->dev,
@@ -1492,10 +1493,11 @@ static int ltc2983_probe(struct spi_device *spi)
ret = ltc2983_parse_dt(st);
if (ret)
return ret;
- /*
- * let's request the irq now so it is used to sync the device
- * startup in ltc2983_setup()
- */
+
+ ret = ltc2983_setup(st, true);
+ if (ret)
+ return ret;
+
ret = devm_request_irq(&spi->dev, spi->irq, ltc2983_irq_handler,
IRQF_TRIGGER_RISING, name, st);
if (ret) {
@@ -1503,10 +1505,6 @@ static int ltc2983_probe(struct spi_device *spi)
return ret;
}
- ret = ltc2983_setup(st, true);
- if (ret)
- return ret;
-
indio_dev->name = name;
indio_dev->num_channels = st->iio_channels;
indio_dev->channels = st->iio_chan;
--
2.32.0
The patch titled
Subject: mm/hwpoison: retry with shake_page() for unhandlable pages
has been added to the -mm tree. Its filename is
mm-hwpoison-retry-with-shake_page-for-unhandlable-pages.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-hwpoison-retry-with-shake_page…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-hwpoison-retry-with-shake_page…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Subject: mm/hwpoison: retry with shake_page() for unhandlable pages
HWPoisonHandlable() sometimes returns false for typical user pages due to
races with average memory events like transfers over LRU lists. This
causes failures in hwpoison handling.
There's retry code for such a case but does not work because the retry
loop reaches the retry limit too quickly before the page settles down to
handlable state. Let get_any_page() call shake_page() to fix it.
Link: https://lkml.kernel.org/r/20210817053703.2267588-1-naoya.horiguchi@linux.dev
Fixes: 25182f05ffed ("mm,hwpoison: fix race with hugetlb page allocation")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reported-by: Tony Luck <tony.luck(a)intel.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: <stable(a)vger.kernel.org> [5.13]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory-failure.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
--- a/mm/memory-failure.c~mm-hwpoison-retry-with-shake_page-for-unhandlable-pages
+++ a/mm/memory-failure.c
@@ -1146,7 +1146,7 @@ static int __get_hwpoison_page(struct pa
* unexpected races caused by taking a page refcount.
*/
if (!HWPoisonHandlable(head))
- return 0;
+ return -EBUSY;
if (PageTransHuge(head)) {
/*
@@ -1199,9 +1199,14 @@ try_again:
}
goto out;
} else if (ret == -EBUSY) {
- /* We raced with freeing huge page to buddy, retry. */
- if (pass++ < 3)
+ /*
+ * We raced with (possibly temporary) unhandlable
+ * page, retry.
+ */
+ if (pass++ < 3) {
+ shake_page(p, 1);
goto try_again;
+ }
goto out;
}
}
_
Patches currently in -mm which might be from naoya.horiguchi(a)nec.com are
mm-hwpoison-retry-with-shake_page-for-unhandlable-pages.patch
mm-sparse-set-section_nid_shift-to-6.patch
This is the start of the stable review cycle for the 5.4.142 release.
There are 64 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed, 18 Aug 2021 17:13:49 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.4.142-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.4.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.4.142-rc2
Maxim Levitsky <mlevitsk(a)redhat.com>
KVM: nSVM: always intercept VMLOAD/VMSAVE when nested (CVE-2021-3656)
Maxim Levitsky <mlevitsk(a)redhat.com>
KVM: nSVM: avoid picking up unsupported bits from L2 in int_ctl (CVE-2021-3653)
Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
iommu/vt-d: Fix agaw for a supported 48 bit guest address width
Nathan Chancellor <nathan(a)kernel.org>
vmlinux.lds.h: Handle clang's module.{c,d}tor sections
Jeff Layton <jlayton(a)kernel.org>
ceph: take snap_empty_lock atomically with snaprealm refcount change
Jeff Layton <jlayton(a)kernel.org>
ceph: clean up locking annotation for ceph_get_snap_realm and __lookup_snap_realm
Jeff Layton <jlayton(a)kernel.org>
ceph: add some lockdep assertions around snaprealm handling
Sean Christopherson <seanjc(a)google.com>
KVM: VMX: Use current VMCS to query WAITPKG support for MSR emulation
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Protect msi_desc::masked for multi-MSI
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Use msi_mask_irq() in pci_msi_shutdown()
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Correct misleading comments
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Do not set invalid bits in MSI mask
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Enforce MSI[X] entry updates to be visible
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Enforce that MSI-X table entry is masked for update
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Mask all unused MSI-X entries
Thomas Gleixner <tglx(a)linutronix.de>
PCI/MSI: Enable and mask MSI-X early
Ben Dai <ben.dai(a)unisoc.com>
genirq/timings: Prevent potential array overflow in __irq_timings_store()
Bixuan Cui <cuibixuan(a)huawei.com>
genirq/msi: Ensure deactivation on teardown
Babu Moger <Babu.Moger(a)amd.com>
x86/resctrl: Fix default monitoring groups reporting
Thomas Gleixner <tglx(a)linutronix.de>
x86/ioapic: Force affinity setup before startup
Thomas Gleixner <tglx(a)linutronix.de>
x86/msi: Force affinity setup before startup
Thomas Gleixner <tglx(a)linutronix.de>
genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP
Randy Dunlap <rdunlap(a)infradead.org>
x86/tools: Fix objdump version check again
Pu Lehui <pulehui(a)huawei.com>
powerpc/kprobes: Fix kprobe Oops happens in booke
Xie Yongji <xieyongji(a)bytedance.com>
nbd: Aovid double completion of a request
Longpeng(Mike) <longpeng2(a)huawei.com>
vsock/virtio: avoid potential deadlock when vsock device remove
Maximilian Heyne <mheyne(a)amazon.de>
xen/events: Fix race in set_evtchn_to_irq
Eric Dumazet <edumazet(a)google.com>
net: igmp: increase size of mr_ifc_count
Neal Cardwell <ncardwell(a)google.com>
tcp_bbr: fix u32 wrap bug in round logic if bbr_init() called after 2B packets
Willy Tarreau <w(a)1wt.eu>
net: linkwatch: fix failure to restore device state across suspend/resume
Yang Yingliang <yangyingliang(a)huawei.com>
net: bridge: fix memleak in br_add_if()
Vladimir Oltean <vladimir.oltean(a)nxp.com>
net: dsa: sja1105: fix broken backpressure in .port_fdb_dump
Vladimir Oltean <vladimir.oltean(a)nxp.com>
net: dsa: lantiq: fix broken backpressure in .port_fdb_dump
Vladimir Oltean <vladimir.oltean(a)nxp.com>
net: dsa: lan9303: fix broken backpressure in .port_fdb_dump
Eric Dumazet <edumazet(a)google.com>
net: igmp: fix data-race in igmp_ifc_timer_expire()
Takeshi Misawa <jeliantsurux(a)gmail.com>
net: Fix memory leak in ieee802154_raw_deliver
Ben Hutchings <ben.hutchings(a)mind.be>
net: dsa: microchip: Fix ksz_read64()
Christian Hewitt <christianshewitt(a)gmail.com>
drm/meson: fix colour distortion from HDR set during vendor u-boot
Aya Levin <ayal(a)nvidia.com>
net/mlx5: Fix return value from tracer initialization
Roi Dayan <roid(a)nvidia.com>
psample: Add a fwd declaration for skbuff
Md Fahad Iqbal Polash <md.fahad.iqbal.polash(a)intel.com>
iavf: Set RSS LUT and key in reset handle path
Hangbin Liu <liuhangbin(a)gmail.com>
net: sched: act_mirred: Reset ct info when mirror/redirect skb
Pali Rohár <pali(a)kernel.org>
ppp: Fix generating ifname when empty IFLA_IFNAME is specified
Ben Hutchings <ben.hutchings(a)mind.be>
net: phy: micrel: Fix link detection on ksz87xx switch"
Hans de Goede <hdegoede(a)redhat.com>
platform/x86: pcengines-apuv2: Add missing terminating entries to gpio-lookup tables
Florian Eckert <fe(a)dev.tdt.de>
platform/x86: pcengines-apuv2: revert wiring up simswitch GPIO as LED
DENG Qingfang <dqfext(a)gmail.com>
net: dsa: mt7530: add the missing RxUnicast MIB counter
Richard Fitzgerald <rf(a)opensource.cirrus.com>
ASoC: cs42l42: Fix LRCLK frame start edge
Yajun Deng <yajun.deng(a)linux.dev>
netfilter: nf_conntrack_bridge: Fix memory leak when error
Richard Fitzgerald <rf(a)opensource.cirrus.com>
ASoC: cs42l42: Remove duplicate control for WNF filter frequency
Richard Fitzgerald <rf(a)opensource.cirrus.com>
ASoC: cs42l42: Fix inversion of ADC Notch Switch control
Richard Fitzgerald <rf(a)opensource.cirrus.com>
ASoC: cs42l42: Don't allow SND_SOC_DAIFMT_LEFT_J
Richard Fitzgerald <rf(a)opensource.cirrus.com>
ASoC: cs42l42: Correct definition of ADC Volume control
Dongliang Mu <mudongliangabcd(a)gmail.com>
ieee802154: hwsim: fix GPF in hwsim_new_edge_nl
Dongliang Mu <mudongliangabcd(a)gmail.com>
ieee802154: hwsim: fix GPF in hwsim_set_edge_lqi
Dan Williams <dan.j.williams(a)intel.com>
libnvdimm/region: Fix label activation vs errors
Dan Williams <dan.j.williams(a)intel.com>
ACPI: NFIT: Fix support for virtual SPA ranges
Luis Henriques <lhenriques(a)suse.de>
ceph: reduce contention in ceph_check_delayed_caps()
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
i2c: dev: zero out array used for i2c reads from userspace
Takashi Iwai <tiwai(a)suse.de>
ASoC: intel: atom: Fix reference to PCM buffer address
Takashi Iwai <tiwai(a)suse.de>
ASoC: xilinx: Fix reference to PCM buffer address
Colin Ian King <colin.king(a)canonical.com>
iio: adc: Fix incorrect exit of for-loop
Chris Lesiak <chris.lesiak(a)licor.com>
iio: humidity: hdc100x: Add margin to the conversion time
Uwe Kleine-König <u.kleine-koenig(a)pengutronix.de>
iio: adc: ti-ads7950: Ensure CS is deasserted after reading channels
-------------
Diffstat:
Makefile | 4 +-
arch/powerpc/kernel/kprobes.c | 3 +-
arch/x86/include/asm/svm.h | 2 +
arch/x86/kernel/apic/io_apic.c | 6 +-
arch/x86/kernel/apic/msi.c | 13 ++-
arch/x86/kernel/cpu/resctrl/monitor.c | 27 +++--
arch/x86/kvm/svm.c | 18 +--
arch/x86/kvm/vmx/vmx.h | 2 +-
arch/x86/tools/chkobjdump.awk | 1 +
drivers/acpi/nfit/core.c | 3 +
drivers/base/core.c | 1 +
drivers/block/nbd.c | 14 ++-
drivers/gpu/drm/meson/meson_registers.h | 5 +
drivers/gpu/drm/meson/meson_viu.c | 7 +-
drivers/i2c/i2c-dev.c | 5 +-
drivers/iio/adc/palmas_gpadc.c | 4 +-
drivers/iio/adc/ti-ads7950.c | 1 -
drivers/iio/humidity/hdc100x.c | 6 +-
drivers/iommu/intel-iommu.c | 7 +-
drivers/net/dsa/lan9303-core.c | 34 +++---
drivers/net/dsa/lantiq_gswip.c | 14 ++-
drivers/net/dsa/microchip/ksz_common.h | 8 +-
drivers/net/dsa/mt7530.c | 1 +
drivers/net/dsa/sja1105/sja1105_main.c | 4 +-
drivers/net/ethernet/intel/iavf/iavf_main.c | 13 ++-
.../ethernet/mellanox/mlx5/core/diag/fw_tracer.c | 11 +-
drivers/net/ieee802154/mac802154_hwsim.c | 6 +-
drivers/net/phy/micrel.c | 2 -
drivers/net/ppp/ppp_generic.c | 2 +-
drivers/nvdimm/namespace_devs.c | 17 ++-
drivers/pci/msi.c | 125 +++++++++++++--------
drivers/platform/x86/pcengines-apuv2.c | 5 +-
drivers/xen/events/events_base.c | 20 +++-
fs/ceph/caps.c | 17 ++-
fs/ceph/mds_client.c | 25 +++--
fs/ceph/snap.c | 54 +++++----
fs/ceph/super.h | 2 +-
include/asm-generic/vmlinux.lds.h | 1 +
include/linux/device.h | 1 +
include/linux/inetdevice.h | 2 +-
include/linux/irq.h | 2 +
include/linux/msi.h | 2 +-
include/net/psample.h | 2 +
kernel/irq/chip.c | 5 +-
kernel/irq/msi.c | 13 ++-
kernel/irq/timings.c | 5 +
net/bridge/br_if.c | 2 +
net/bridge/netfilter/nf_conntrack_bridge.c | 6 +
net/core/link_watch.c | 5 +-
net/ieee802154/socket.c | 7 +-
net/ipv4/igmp.c | 21 ++--
net/ipv4/tcp_bbr.c | 2 +-
net/sched/act_mirred.c | 3 +
net/vmw_vsock/virtio_transport.c | 7 +-
sound/soc/codecs/cs42l42.c | 39 +++----
sound/soc/intel/atom/sst-mfld-platform-pcm.c | 3 +-
sound/soc/xilinx/xlnx_formatter_pcm.c | 4 +-
57 files changed, 395 insertions(+), 226 deletions(-)
This is the start of the stable review cycle for the 5.10.59 release.
There are 19 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sun, 15 Aug 2021 15:05:12 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.10.59-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.10.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.10.59-rc1
YueHaibing <yuehaibing(a)huawei.com>
net: xilinx_emaclite: Do not print real IOMEM pointer
Miklos Szeredi <mszeredi(a)redhat.com>
ovl: prevent private clone if bind mount is not allowed
Pali Rohár <pali(a)kernel.org>
ppp: Fix generating ppp unit id when ifname is not specified
Luke D Jones <luke(a)ljones.dev>
ALSA: hda: Add quirk for ASUS Flow x13
Jeremy Szu <jeremy.szu(a)canonical.com>
ALSA: hda/realtek: fix mute/micmute LEDs for HP ProBook 650 G8 Notebook PC
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix mmap breakage without explicit buffer setup
Longfang Liu <liulongfang(a)huawei.com>
USB:ehci:fix Kunpeng920 ehci hardware problem
Hans de Goede <hdegoede(a)redhat.com>
vboxsf: Make vboxsf_dir_create() return the handle for the created file
Hans de Goede <hdegoede(a)redhat.com>
vboxsf: Honor excl flag to the dir-inode create op
Adam Ford <aford173(a)gmail.com>
arm64: dts: renesas: beacon: Fix USB ref clock references
Adam Ford <aford173(a)gmail.com>
arm64: dts: renesas: beacon: Fix USB extal reference
Adam Ford <aford173(a)gmail.com>
arm64: dts: renesas: rzg2: Add usb2_clksel to RZ/G2 M/N/H
Mike Rapoport <rppt(a)kernel.org>
mm: make zone_to_nid() and zone_set_nid() available for DISCONTIGMEM
Reinette Chatre <reinette.chatre(a)intel.com>
Revert "selftests/resctrl: Use resctrl/info for feature detection"
Daniel Borkmann <daniel(a)iogearbox.net>
bpf: Add lockdown check for probe_write_user helper
Daniel Borkmann <daniel(a)iogearbox.net>
bpf: Add _kernel suffix to internal lockdown_bpf_read
Allen Pais <apais(a)linux.microsoft.com>
firmware: tee_bnxt: Release TEE shm, session, and context during kexec
Sumit Garg <sumit.garg(a)linaro.org>
tee: Correct inappropriate usage of TEE_SHM_DMA_BUF flag
Sean Christopherson <seanjc(a)google.com>
KVM: SVM: Fix off-by-one indexing when nullifying last used SEV VMCB
-------------
Diffstat:
Makefile | 4 +-
.../boot/dts/renesas/beacon-renesom-baseboard.dtsi | 4 +-
.../arm64/boot/dts/renesas/beacon-renesom-som.dtsi | 6 ++-
arch/arm64/boot/dts/renesas/r8a774a1.dtsi | 15 +++++++
arch/arm64/boot/dts/renesas/r8a774b1.dtsi | 15 +++++++
arch/arm64/boot/dts/renesas/r8a774e1.dtsi | 15 +++++++
arch/x86/kvm/svm/sev.c | 2 +-
drivers/firmware/broadcom/tee_bnxt_fw.c | 14 ++++--
drivers/net/ethernet/xilinx/xilinx_emaclite.c | 5 +--
drivers/net/ppp/ppp_generic.c | 19 ++++++--
drivers/tee/optee/call.c | 2 +-
drivers/tee/optee/core.c | 3 +-
drivers/tee/optee/rpc.c | 5 ++-
drivers/tee/optee/shm_pool.c | 8 +++-
drivers/tee/tee_shm.c | 4 +-
drivers/usb/host/ehci-pci.c | 3 ++
fs/namespace.c | 42 +++++++++++------
fs/vboxsf/dir.c | 28 +++++++-----
include/linux/mmzone.h | 4 +-
include/linux/security.h | 3 +-
include/linux/tee_drv.h | 1 +
kernel/bpf/helpers.c | 4 +-
kernel/trace/bpf_trace.c | 13 +++---
security/security.c | 3 +-
sound/core/pcm_native.c | 5 ++-
sound/pci/hda/patch_realtek.c | 2 +
tools/testing/selftests/resctrl/resctrl.h | 6 +--
tools/testing/selftests/resctrl/resctrlfs.c | 52 +++++-----------------
28 files changed, 178 insertions(+), 109 deletions(-)
This is a note to let you know that I've just added the patch titled
iio: ltc2983: fix device probe
to my staging git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
in the staging-testing branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will be merged to the staging-next branch sometime soon,
after it passes testing, and the merge window is open.
If you have any questions about this process, please let me know.
>From b76d26d69ecc97ebb24aaf40427a13c808a4f488 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nuno=20S=C3=A1?= <nuno.sa(a)analog.com>
Date: Wed, 11 Aug 2021 15:32:20 +0200
Subject: iio: ltc2983: fix device probe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There is no reason to assume that the IRQ rising edge (indicating that
the device start up phase is done) will happen after we request the IRQ.
If the device is already up by the time we request it, the call to
'wait_for_completion_timeout()' will timeout and we will fail the device
probe even though there's nothing wrong.
Fix it by just polling the status register until we get the indication that
the device is up and running. As a side effect of this fix, requesting the
IRQ is also moved to after the setup function.
Fixes: f110f3188e563 ("iio: temperature: Add support for LTC2983")
Reported-and-tested-by: Drew Fustini <drew(a)pdp7.com>
Reviewed-by: Drew Fustini <drew(a)pdp7.com>
Signed-off-by: Nuno Sá <nuno.sa(a)analog.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko(a)gmail.com>
Cc: <Stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20210811133220.190264-2-nuno.sa@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/temperature/ltc2983.c | 30 ++++++++++++++----------------
1 file changed, 14 insertions(+), 16 deletions(-)
diff --git a/drivers/iio/temperature/ltc2983.c b/drivers/iio/temperature/ltc2983.c
index 3b5ba26d7d86..3b4a0e60e605 100644
--- a/drivers/iio/temperature/ltc2983.c
+++ b/drivers/iio/temperature/ltc2983.c
@@ -89,6 +89,8 @@
#define LTC2983_STATUS_START_MASK BIT(7)
#define LTC2983_STATUS_START(x) FIELD_PREP(LTC2983_STATUS_START_MASK, x)
+#define LTC2983_STATUS_UP_MASK GENMASK(7, 6)
+#define LTC2983_STATUS_UP(reg) FIELD_GET(LTC2983_STATUS_UP_MASK, reg)
#define LTC2983_STATUS_CHAN_SEL_MASK GENMASK(4, 0)
#define LTC2983_STATUS_CHAN_SEL(x) \
@@ -1362,17 +1364,16 @@ static int ltc2983_parse_dt(struct ltc2983_data *st)
static int ltc2983_setup(struct ltc2983_data *st, bool assign_iio)
{
- u32 iio_chan_t = 0, iio_chan_v = 0, chan, iio_idx = 0;
+ u32 iio_chan_t = 0, iio_chan_v = 0, chan, iio_idx = 0, status;
int ret;
- unsigned long time;
-
- /* make sure the device is up */
- time = wait_for_completion_timeout(&st->completion,
- msecs_to_jiffies(250));
- if (!time) {
+ /* make sure the device is up: start bit (7) is 0 and done bit (6) is 1 */
+ ret = regmap_read_poll_timeout(st->regmap, LTC2983_STATUS_REG, status,
+ LTC2983_STATUS_UP(status) == 1, 25000,
+ 25000 * 10);
+ if (ret) {
dev_err(&st->spi->dev, "Device startup timed out\n");
- return -ETIMEDOUT;
+ return ret;
}
st->iio_chan = devm_kzalloc(&st->spi->dev,
@@ -1492,10 +1493,11 @@ static int ltc2983_probe(struct spi_device *spi)
ret = ltc2983_parse_dt(st);
if (ret)
return ret;
- /*
- * let's request the irq now so it is used to sync the device
- * startup in ltc2983_setup()
- */
+
+ ret = ltc2983_setup(st, true);
+ if (ret)
+ return ret;
+
ret = devm_request_irq(&spi->dev, spi->irq, ltc2983_irq_handler,
IRQF_TRIGGER_RISING, name, st);
if (ret) {
@@ -1503,10 +1505,6 @@ static int ltc2983_probe(struct spi_device *spi)
return ret;
}
- ret = ltc2983_setup(st, true);
- if (ret)
- return ret;
-
indio_dev->name = name;
indio_dev->num_channels = st->iio_channels;
indio_dev->channels = st->iio_chan;
--
2.32.0
The return value need to be either ignored or acted upon, otherwise 'deadstore'
clang check would yell at us. I think it's better to just ignore what this
particular call of set_registers() returns. The adapter defaults are sane and
it would be operational even if the register write fail.
Signed-off-by: Petko Manolov <petko.manolov(a)konsulko.com>
---
drivers/net/usb/pegasus.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c
index 652e9fcf0b77..49cfc720d78f 100644
--- a/drivers/net/usb/pegasus.c
+++ b/drivers/net/usb/pegasus.c
@@ -433,7 +433,7 @@ static int enable_net_traffic(struct net_device *dev, struct usb_device *usb)
data[2] = loopback ? 0x09 : 0x01;
memcpy(pegasus->eth_regs, data, sizeof(data));
- ret = set_registers(pegasus, EthCtrl0, 3, data);
+ set_registers(pegasus, EthCtrl0, 3, data);
if (usb_dev_id[pegasus->dev_index].vendor == VENDOR_LINKSYS ||
usb_dev_id[pegasus->dev_index].vendor == VENDOR_LINKSYS2 ||
--
2.30.2
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f9dfb5e390fab2df9f7944bb91e7705aba14cd26 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Fri, 18 Jun 2021 16:18:25 +0200
Subject: [PATCH] x86/fpu: Make init_fpstate correct with optimized XSAVE
The XSAVE init code initializes all enabled and supported components with
XRSTOR(S) to init state. Then it XSAVEs the state of the components back
into init_fpstate which is used in several places to fill in the init state
of components.
This works correctly with XSAVE, but not with XSAVEOPT and XSAVES because
those use the init optimization and skip writing state of components which
are in init state. So init_fpstate.xsave still contains all zeroes after
this operation.
There are two ways to solve that:
1) Use XSAVE unconditionally, but that requires to reshuffle the buffer when
XSAVES is enabled because XSAVES uses compacted format.
2) Save the components which are known to have a non-zero init state by other
means.
Looking deeper, #2 is the right thing to do because all components the
kernel supports have all-zeroes init state except the legacy features (FP,
SSE). Those cannot be hard coded because the states are not identical on all
CPUs, but they can be saved with FXSAVE which avoids all conditionals.
Use FXSAVE to save the legacy FP/SSE components in init_fpstate along with
a BUILD_BUG_ON() which reminds developers to validate that a newly added
component has all zeroes init state. As a bonus remove the now unused
copy_xregs_to_kernel_booting() crutch.
The XSAVE and reshuffle method can still be implemented in the unlikely
case that components are added which have a non-zero init state and no
other means to save them. For now, FXSAVE is just simple and good enough.
[ bp: Fix a typo or two in the text. ]
Fixes: 6bad06b76892 ("x86, xsave: Use xsaveopt in context-switch path when supported")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Reviewed-by: Borislav Petkov <bp(a)suse.de>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20210618143444.587311343@linutronix.de
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index fdee23ea4e17..16bf4d4a8159 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -204,6 +204,14 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
}
+static inline void fxsave(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
+ else
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
+}
+
/* These macros all use (%edi)/(%rdi) as the single memory argument. */
#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
@@ -268,28 +276,6 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
-{
- u64 mask = xfeatures_mask_all;
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
- else
- XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
-
- /* We should never fault when copying to a kernel buffer: */
- WARN_ON_FPU(err);
-}
-
/*
* This function is called only during boot time when x86 caps are not set
* up and alternative can not be used yet.
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d0eef963aad1..1cadb2faf740 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -440,6 +440,25 @@ static void __init print_xstate_offset_size(void)
}
}
+/*
+ * All supported features have either init state all zeros or are
+ * handled in setup_init_fpu() individually. This is an explicit
+ * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
+ * newly added supported features at build time and make people
+ * actually look at the init state for the new feature.
+ */
+#define XFEATURES_INIT_FPSTATE_HANDLED \
+ (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_PASID)
+
/*
* setup the xstate image representing the init state
*/
@@ -447,6 +466,10 @@ static void __init setup_init_fpu_buf(void)
{
static int on_boot_cpu __initdata = 1;
+ BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
+ XFEATURES_INIT_FPSTATE_HANDLED);
+
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -466,10 +489,22 @@ static void __init setup_init_fpu_buf(void)
copy_kernel_to_xregs_booting(&init_fpstate.xsave);
/*
- * Dump the init state again. This is to identify the init state
- * of any feature which is not represented by all zero's.
+ * All components are now in init state. Read the state back so
+ * that init_fpstate contains all non-zero init state. This only
+ * works with XSAVE, but not with XSAVEOPT and XSAVES because
+ * those use the init optimization which skips writing data for
+ * components in init state.
+ *
+ * XSAVE could be used, but that would require to reshuffle the
+ * data when XSAVES is available because XSAVES uses xstate
+ * compaction. But doing so is a pointless exercise because most
+ * components have an all zeros init state except for the legacy
+ * ones (FP and SSE). Those can be saved with FXSAVE into the
+ * legacy area. Adding new features requires to ensure that init
+ * state is all zeroes or if not to add the necessary handling
+ * here.
*/
- copy_xregs_to_kernel_booting(&init_fpstate.xsave);
+ fxsave(&init_fpstate.fxsave);
}
static int xfeature_uncompacted_offset(int xfeature_nr)
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f9dfb5e390fab2df9f7944bb91e7705aba14cd26 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Fri, 18 Jun 2021 16:18:25 +0200
Subject: [PATCH] x86/fpu: Make init_fpstate correct with optimized XSAVE
The XSAVE init code initializes all enabled and supported components with
XRSTOR(S) to init state. Then it XSAVEs the state of the components back
into init_fpstate which is used in several places to fill in the init state
of components.
This works correctly with XSAVE, but not with XSAVEOPT and XSAVES because
those use the init optimization and skip writing state of components which
are in init state. So init_fpstate.xsave still contains all zeroes after
this operation.
There are two ways to solve that:
1) Use XSAVE unconditionally, but that requires to reshuffle the buffer when
XSAVES is enabled because XSAVES uses compacted format.
2) Save the components which are known to have a non-zero init state by other
means.
Looking deeper, #2 is the right thing to do because all components the
kernel supports have all-zeroes init state except the legacy features (FP,
SSE). Those cannot be hard coded because the states are not identical on all
CPUs, but they can be saved with FXSAVE which avoids all conditionals.
Use FXSAVE to save the legacy FP/SSE components in init_fpstate along with
a BUILD_BUG_ON() which reminds developers to validate that a newly added
component has all zeroes init state. As a bonus remove the now unused
copy_xregs_to_kernel_booting() crutch.
The XSAVE and reshuffle method can still be implemented in the unlikely
case that components are added which have a non-zero init state and no
other means to save them. For now, FXSAVE is just simple and good enough.
[ bp: Fix a typo or two in the text. ]
Fixes: 6bad06b76892 ("x86, xsave: Use xsaveopt in context-switch path when supported")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Reviewed-by: Borislav Petkov <bp(a)suse.de>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20210618143444.587311343@linutronix.de
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index fdee23ea4e17..16bf4d4a8159 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -204,6 +204,14 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
}
+static inline void fxsave(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
+ else
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
+}
+
/* These macros all use (%edi)/(%rdi) as the single memory argument. */
#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
@@ -268,28 +276,6 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
-{
- u64 mask = xfeatures_mask_all;
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
- else
- XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
-
- /* We should never fault when copying to a kernel buffer: */
- WARN_ON_FPU(err);
-}
-
/*
* This function is called only during boot time when x86 caps are not set
* up and alternative can not be used yet.
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d0eef963aad1..1cadb2faf740 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -440,6 +440,25 @@ static void __init print_xstate_offset_size(void)
}
}
+/*
+ * All supported features have either init state all zeros or are
+ * handled in setup_init_fpu() individually. This is an explicit
+ * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
+ * newly added supported features at build time and make people
+ * actually look at the init state for the new feature.
+ */
+#define XFEATURES_INIT_FPSTATE_HANDLED \
+ (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_PASID)
+
/*
* setup the xstate image representing the init state
*/
@@ -447,6 +466,10 @@ static void __init setup_init_fpu_buf(void)
{
static int on_boot_cpu __initdata = 1;
+ BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
+ XFEATURES_INIT_FPSTATE_HANDLED);
+
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -466,10 +489,22 @@ static void __init setup_init_fpu_buf(void)
copy_kernel_to_xregs_booting(&init_fpstate.xsave);
/*
- * Dump the init state again. This is to identify the init state
- * of any feature which is not represented by all zero's.
+ * All components are now in init state. Read the state back so
+ * that init_fpstate contains all non-zero init state. This only
+ * works with XSAVE, but not with XSAVEOPT and XSAVES because
+ * those use the init optimization which skips writing data for
+ * components in init state.
+ *
+ * XSAVE could be used, but that would require to reshuffle the
+ * data when XSAVES is available because XSAVES uses xstate
+ * compaction. But doing so is a pointless exercise because most
+ * components have an all zeros init state except for the legacy
+ * ones (FP and SSE). Those can be saved with FXSAVE into the
+ * legacy area. Adding new features requires to ensure that init
+ * state is all zeroes or if not to add the necessary handling
+ * here.
*/
- copy_xregs_to_kernel_booting(&init_fpstate.xsave);
+ fxsave(&init_fpstate.fxsave);
}
static int xfeature_uncompacted_offset(int xfeature_nr)
Hi All,
While I was testing v5.4.142-rc2 I noticed mips build of
decstation_r4k_defconfig fails with binutils-2_37. The error is:
arch/mips/dec/prom/locore.S: Assembler messages:
arch/mips/dec/prom/locore.S:29: Error: opcode not supported on this
processor: r4600 (mips3) `rfe'
I have also reported this at https://sourceware.org/bugzilla/show_bug.cgi?id=28241
--
Regards
Sudip
Commit 2860cd8a2353 ("livepatch: Use the default ftrace_ops instead of
REGS when ARGS is available") intends to enable config LIVEPATCH when
ftrace with ARGS is available. However, the chain of configs to enable
LIVEPATCH is incomplete, as HAVE_DYNAMIC_FTRACE_WITH_ARGS is available,
but the definition of DYNAMIC_FTRACE_WITH_ARGS, combining DYNAMIC_FTRACE
and HAVE_DYNAMIC_FTRACE_WITH_ARGS, needed to enable LIVEPATCH, is missing
in the commit.
Fortunately, ./scripts/checkkconfigsymbols.py detects this and warns:
DYNAMIC_FTRACE_WITH_ARGS
Referencing files: kernel/livepatch/Kconfig
So, define the config DYNAMIC_FTRACE_WITH_ARGS analogously to the already
existing similar configs, DYNAMIC_FTRACE_WITH_REGS and
DYNAMIC_FTRACE_WITH_DIRECT_CALLS, in ./kernel/trace/Kconfig to connect the
chain of configs.
Fixes: 2860cd8a2353 ("livepatch: Use the default ftrace_ops instead of REGS when ARGS is available")
Cc: <stable(a)vger.kernel.org> # 5.10.x
Signed-off-by: Lukas Bulwahn <lukas.bulwahn(a)gmail.com>
---
Steven, thanks for the quick response; please pick this quick config fix.
kernel/trace/Kconfig | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d567b1717c4c..3ee23f4d437f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -219,6 +219,11 @@ config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
depends on DYNAMIC_FTRACE_WITH_REGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+config DYNAMIC_FTRACE_WITH_ARGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
--
2.17.1
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f9dfb5e390fab2df9f7944bb91e7705aba14cd26 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Fri, 18 Jun 2021 16:18:25 +0200
Subject: [PATCH] x86/fpu: Make init_fpstate correct with optimized XSAVE
The XSAVE init code initializes all enabled and supported components with
XRSTOR(S) to init state. Then it XSAVEs the state of the components back
into init_fpstate which is used in several places to fill in the init state
of components.
This works correctly with XSAVE, but not with XSAVEOPT and XSAVES because
those use the init optimization and skip writing state of components which
are in init state. So init_fpstate.xsave still contains all zeroes after
this operation.
There are two ways to solve that:
1) Use XSAVE unconditionally, but that requires to reshuffle the buffer when
XSAVES is enabled because XSAVES uses compacted format.
2) Save the components which are known to have a non-zero init state by other
means.
Looking deeper, #2 is the right thing to do because all components the
kernel supports have all-zeroes init state except the legacy features (FP,
SSE). Those cannot be hard coded because the states are not identical on all
CPUs, but they can be saved with FXSAVE which avoids all conditionals.
Use FXSAVE to save the legacy FP/SSE components in init_fpstate along with
a BUILD_BUG_ON() which reminds developers to validate that a newly added
component has all zeroes init state. As a bonus remove the now unused
copy_xregs_to_kernel_booting() crutch.
The XSAVE and reshuffle method can still be implemented in the unlikely
case that components are added which have a non-zero init state and no
other means to save them. For now, FXSAVE is just simple and good enough.
[ bp: Fix a typo or two in the text. ]
Fixes: 6bad06b76892 ("x86, xsave: Use xsaveopt in context-switch path when supported")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Reviewed-by: Borislav Petkov <bp(a)suse.de>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20210618143444.587311343@linutronix.de
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index fdee23ea4e17..16bf4d4a8159 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -204,6 +204,14 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
}
+static inline void fxsave(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
+ else
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
+}
+
/* These macros all use (%edi)/(%rdi) as the single memory argument. */
#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
@@ -268,28 +276,6 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
-{
- u64 mask = xfeatures_mask_all;
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
- else
- XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
-
- /* We should never fault when copying to a kernel buffer: */
- WARN_ON_FPU(err);
-}
-
/*
* This function is called only during boot time when x86 caps are not set
* up and alternative can not be used yet.
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d0eef963aad1..1cadb2faf740 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -440,6 +440,25 @@ static void __init print_xstate_offset_size(void)
}
}
+/*
+ * All supported features have either init state all zeros or are
+ * handled in setup_init_fpu() individually. This is an explicit
+ * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
+ * newly added supported features at build time and make people
+ * actually look at the init state for the new feature.
+ */
+#define XFEATURES_INIT_FPSTATE_HANDLED \
+ (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_PASID)
+
/*
* setup the xstate image representing the init state
*/
@@ -447,6 +466,10 @@ static void __init setup_init_fpu_buf(void)
{
static int on_boot_cpu __initdata = 1;
+ BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
+ XFEATURES_INIT_FPSTATE_HANDLED);
+
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -466,10 +489,22 @@ static void __init setup_init_fpu_buf(void)
copy_kernel_to_xregs_booting(&init_fpstate.xsave);
/*
- * Dump the init state again. This is to identify the init state
- * of any feature which is not represented by all zero's.
+ * All components are now in init state. Read the state back so
+ * that init_fpstate contains all non-zero init state. This only
+ * works with XSAVE, but not with XSAVEOPT and XSAVES because
+ * those use the init optimization which skips writing data for
+ * components in init state.
+ *
+ * XSAVE could be used, but that would require to reshuffle the
+ * data when XSAVES is available because XSAVES uses xstate
+ * compaction. But doing so is a pointless exercise because most
+ * components have an all zeros init state except for the legacy
+ * ones (FP and SSE). Those can be saved with FXSAVE into the
+ * legacy area. Adding new features requires to ensure that init
+ * state is all zeroes or if not to add the necessary handling
+ * here.
*/
- copy_xregs_to_kernel_booting(&init_fpstate.xsave);
+ fxsave(&init_fpstate.fxsave);
}
static int xfeature_uncompacted_offset(int xfeature_nr)
Bus bandwidth array access is based on esit, increase one
will cause out-of-bounds issue; for example, when esit is
XHCI_MTK_MAX_ESIT, will overstep boundary.
Fixes: 7c986fbc16ae ("usb: xhci-mtk: get the microframe boundary for ESIT")
Cc: <stable(a)vger.kernel.org>
Reported-by: Stan Lu <stan.lu(a)mediatek.com>
Signed-off-by: Chunfeng Yun <chunfeng.yun(a)mediatek.com>
---
drivers/usb/host/xhci-mtk-sch.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/drivers/usb/host/xhci-mtk-sch.c b/drivers/usb/host/xhci-mtk-sch.c
index cffcaf4dfa9f..0bb1a6295d64 100644
--- a/drivers/usb/host/xhci-mtk-sch.c
+++ b/drivers/usb/host/xhci-mtk-sch.c
@@ -575,10 +575,12 @@ static u32 get_esit_boundary(struct mu3h_sch_ep_info *sch_ep)
u32 boundary = sch_ep->esit;
if (sch_ep->sch_tt) { /* LS/FS with TT */
- /* tune for CS */
- if (sch_ep->ep_type != ISOC_OUT_EP)
- boundary++;
- else if (boundary > 1) /* normally esit >= 8 for FS/LS */
+ /*
+ * tune for CS, normally esit >= 8 for FS/LS,
+ * not add one for other types to avoid access array
+ * out of boundary
+ */
+ if (sch_ep->ep_type == ISOC_OUT_EP && boundary > 1)
boundary--;
}
--
2.18.0
On Fri, 29 Jan 2021 17:12:08 +0100,
Michael Catanzaro wrote:
>
> On Fri, Jan 29, 2021 at 9:30 am, Michael Catanzaro
> <mcatanzaro(a)redhat.com> wrote:
> > OK, I found "ALSA: hda/via: Apply the workaround generically for
> > Clevo machines" which was just merged yesterday. So I will test
> > again to find out.
>
> Hi Takashi, hi Harsha,
>
> I can confirm that the problem is fixed by this commit:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
Thanks, good to hear.
Then I think we can drop the entry from power_save_denylist in
hda_intel.c. Could you try that it still works with the patch below?
thanks,
Takashi
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -2217,8 +2217,6 @@ static const struct snd_pci_quirk power_save_denylist[] = {
/* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */
SND_PCI_QUIRK(0x1043, 0x8733, "Asus Prime X370-Pro", 0),
/* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */
- SND_PCI_QUIRK(0x1558, 0x6504, "Clevo W65_67SB", 0),
- /* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */
SND_PCI_QUIRK(0x1028, 0x0497, "Dell Precision T3600", 0),
/* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */
/* Note the P55A-UD3 and Z87-D3HP share the subsys id for the HDA dev */
Hello,
We ran automated tests on a recent commit from this kernel tree:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Commit: 13f088214330 - Linux 5.13.12-rc2
The results of these automated tests are provided below.
Overall result: PASSED
Merge: OK
Compile: OK
Tests: OK
All kernel binaries, config files, and logs are available for download here:
https://arr-cki-prod-datawarehouse-public.s3.amazonaws.com/index.html?prefi…
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
,-. ,-.
( C ) ( K ) Continuous
`-',-.`-' Kernel
( I ) Integration
`-'
______________________________________________________________________________
Compile testing
---------------
We compiled the kernel for 4 architectures:
aarch64:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
s390x:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
Hardware testing
----------------
We booted each kernel and ran the following tests:
aarch64:
Host 1:
✅ Boot test
✅ Reboot test
✅ xfstests - ext4
✅ xfstests - xfs
✅ selinux-policy: serge-testsuite
✅ storage: software RAID testing
✅ Storage: swraid mdadm raid_module test
🚧 ✅ Podman system integration test - as root
🚧 ✅ Podman system integration test - as user
🚧 ✅ xfstests - btrfs
🚧 ✅ IPMI driver test
🚧 ✅ IPMItool loop stress test
🚧 ✅ Storage blktests
🚧 ✅ Storage block - filesystem fio test
🚧 ✅ Storage block - queue scheduler test
🚧 ✅ Storage nvme - tcp
🚧 💥 stress: stress-ng
Host 2:
✅ Boot test
✅ Reboot test
✅ ACPI table test
✅ ACPI enabled test
✅ LTP
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking: igmp conformance test
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ storage: SCSI VPD
✅ trace: ftrace/tracer
🚧 ✅ xarray-idr-radixtree-test
🚧 ✅ i2c: i2cdetect sanity
🚧 ✅ NFS Connectathon
🚧 ✅ Firmware test suite
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
ppc64le:
Host 1:
✅ Boot test
✅ Reboot test
✅ LTP
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ trace: ftrace/tracer
🚧 ✅ xarray-idr-radixtree-test
🚧 ✅ NFS Connectathon
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
Host 2:
✅ Boot test
✅ Reboot test
✅ xfstests - ext4
✅ xfstests - xfs
✅ selinux-policy: serge-testsuite
✅ storage: software RAID testing
✅ Storage: swraid mdadm raid_module test
🚧 ✅ Podman system integration test - as root
🚧 ✅ Podman system integration test - as user
🚧 ✅ xfstests - btrfs
🚧 ✅ IPMI driver test
🚧 ✅ IPMItool loop stress test
🚧 ✅ Storage blktests
🚧 ✅ Storage block - filesystem fio test
🚧 ✅ Storage block - queue scheduler test
🚧 ✅ Storage nvme - tcp
🚧 ✅ Storage: lvm device-mapper test - upstream
s390x:
Host 1:
✅ Boot test
✅ Reboot test
✅ LTP
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ trace: ftrace/tracer
🚧 ✅ xarray-idr-radixtree-test
🚧 ✅ NFS Connectathon
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
Host 2:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
✅ Boot test
✅ Reboot test
✅ selinux-policy: serge-testsuite
✅ Storage: swraid mdadm raid_module test
🚧 ✅ Podman system integration test - as root
🚧 ✅ Podman system integration test - as user
🚧 ✅ Storage blktests
🚧 ✅ Storage nvme - tcp
🚧 ⚡⚡⚡ stress: stress-ng
x86_64:
Host 1:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ xfstests - ext4
⚡⚡⚡ xfstests - xfs
⚡⚡⚡ xfstests - nfsv4.2
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ power-management: cpupower/sanity test
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ CPU: Idle Test
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ xfstests - cifsv3.11
🚧 ⚡⚡⚡ IPMI driver test
🚧 ⚡⚡⚡ IPMItool loop stress test
🚧 ⚡⚡⚡ Storage blktests
🚧 ⚡⚡⚡ Storage block - filesystem fio test
🚧 ⚡⚡⚡ Storage block - queue scheduler test
🚧 ⚡⚡⚡ Storage nvme - tcp
🚧 ⚡⚡⚡ Storage: lvm device-mapper test - upstream
🚧 ⚡⚡⚡ stress: stress-ng
Host 2:
✅ Boot test
✅ Reboot test
✅ ACPI table test
✅ LTP
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking: igmp conformance test
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: sanity smoke test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ storage: SCSI VPD
✅ trace: ftrace/tracer
🚧 ❌ xarray-idr-radixtree-test
🚧 ✅ i2c: i2cdetect sanity
🚧 ✅ NFS Connectathon
🚧 ✅ Firmware test suite
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
Host 3:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ xfstests - ext4
⚡⚡⚡ xfstests - xfs
⚡⚡⚡ xfstests - nfsv4.2
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ power-management: cpupower/sanity test
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ CPU: Idle Test
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ xfstests - cifsv3.11
🚧 ⚡⚡⚡ IPMI driver test
🚧 ⚡⚡⚡ IPMItool loop stress test
🚧 ⚡⚡⚡ Storage blktests
🚧 ⚡⚡⚡ Storage block - filesystem fio test
🚧 ⚡⚡⚡ Storage block - queue scheduler test
🚧 ⚡⚡⚡ Storage nvme - tcp
🚧 ⚡⚡⚡ Storage: lvm device-mapper test - upstream
🚧 ⚡⚡⚡ stress: stress-ng
Host 4:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ xfstests - ext4
⚡⚡⚡ xfstests - xfs
⚡⚡⚡ xfstests - nfsv4.2
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ power-management: cpupower/sanity test
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ CPU: Idle Test
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ xfstests - cifsv3.11
🚧 ⚡⚡⚡ IPMI driver test
🚧 ⚡⚡⚡ IPMItool loop stress test
🚧 ⚡⚡⚡ Storage blktests
🚧 ⚡⚡⚡ Storage block - filesystem fio test
🚧 ⚡⚡⚡ Storage block - queue scheduler test
🚧 ⚡⚡⚡ Storage nvme - tcp
🚧 ⚡⚡⚡ Storage: lvm device-mapper test - upstream
🚧 ⚡⚡⚡ stress: stress-ng
Test sources: https://gitlab.com/cki-project/kernel-tests
💚 Pull requests are welcome for new tests or improvements to existing tests!
Aborted tests
-------------
Tests that didn't complete running successfully are marked with ⚡⚡⚡.
If this was caused by an infrastructure issue, we try to mark that
explicitly in the report.
Waived tests
------------
If the test run included waived tests, they are marked with 🚧. Such tests are
executed but their results are not taken into account. Tests are waived when
their results are not reliable enough, e.g. when they're just introduced or are
being fixed.
Testing timeout
---------------
We aim to provide a report within reasonable timeframe. Tests that haven't
finished running yet are marked with ⏱.
The patch titled
Subject: lib: use PFN_PHYS() in devmem_is_allowed()
has been removed from the -mm tree. Its filename was
lib-use-pfn_phys-in-devmem_is_allowed.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Liang Wang <wangliang101(a)huawei.com>
Subject: lib: use PFN_PHYS() in devmem_is_allowed()
The physical address may exceed 32 bits on 32-bit systems with more than
32 bits of physcial address. Use PFN_PHYS() in devmem_is_allowed(), or
the physical address may overflow and be truncated.
We found this bug when mapping a high addresses through devmem tool, when
CONFIG_STRICT_DEVMEM is enabled on the ARM with ARM_LPAE and devmem is
used to map a high address that is not in the iomem address range, an
unexpected error indicating no permission is returned.
This bug was initially introduced from v2.6.37, and the function was moved
to lib when v5.11.
Link: https://lkml.kernel.org/r/20210731025057.78825-1-wangliang101@huawei.com
Fixes: 087aaffcdf9c ("ARM: implement CONFIG_STRICT_DEVMEM by disabling access to RAM via /dev/mem")
Fixes: 527701eda5f1 ("lib: Add a generic version of devmem_is_allowed()")
Signed-off-by: Liang Wang <wangliang101(a)huawei.com>
Reviewed-by: Luis Chamberlain <mcgrof(a)kernel.org>
Cc: Palmer Dabbelt <palmerdabbelt(a)google.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Russell King <linux(a)armlinux.org.uk>
Cc: Liang Wang <wangliang101(a)huawei.com>
Cc: Xiaoming Ni <nixiaoming(a)huawei.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: <stable(a)vger.kernel.org> [2.6.37+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/devmem_is_allowed.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/lib/devmem_is_allowed.c~lib-use-pfn_phys-in-devmem_is_allowed
+++ a/lib/devmem_is_allowed.c
@@ -19,7 +19,7 @@
*/
int devmem_is_allowed(unsigned long pfn)
{
- if (iomem_is_exclusive(pfn << PAGE_SHIFT))
+ if (iomem_is_exclusive(PFN_PHYS(pfn)))
return 0;
if (!page_is_ram(pfn))
return 1;
_
Patches currently in -mm which might be from wangliang101(a)huawei.com are
From: Maxim Levitsky <mlevitsk(a)redhat.com>
[ upstream commit c7dfa4009965a9b2d7b329ee970eb8da0d32f0bc ]
If L1 disables VMLOAD/VMSAVE intercepts, and doesn't enable
Virtual VMLOAD/VMSAVE (currently not supported for the nested hypervisor),
then VMLOAD/VMSAVE must operate on the L1 physical memory, which is only
possible by making L0 intercept these instructions.
Failure to do so allowed the nested guest to run VMLOAD/VMSAVE unintercepted,
and thus read/write portions of the host physical memory.
Fixes: 89c8a4984fc9 ("KVM: SVM: Enable Virtual VMLOAD VMSAVE feature")
Suggested-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Maxim Levitsky <mlevitsk(a)redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
The above upstream SHA1 is still on its way to Linus
arch/x86/kvm/svm.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 515d0b03bf03..387c1dafee2a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -516,6 +516,9 @@ static void recalc_intercepts(struct vcpu_svm *svm)
c->intercept_dr = h->intercept_dr | g->intercept_dr;
c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
c->intercept = h->intercept | g->intercept;
+
+ c->intercept |= (1ULL << INTERCEPT_VMLOAD);
+ c->intercept |= (1ULL << INTERCEPT_VMSAVE);
}
static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
--
2.26.3
From: Maxim Levitsky <mlevitsk(a)redhat.com>
[ upstream commit c7dfa4009965a9b2d7b329ee970eb8da0d32f0bc ]
If L1 disables VMLOAD/VMSAVE intercepts, and doesn't enable
Virtual VMLOAD/VMSAVE (currently not supported for the nested hypervisor),
then VMLOAD/VMSAVE must operate on the L1 physical memory, which is only
possible by making L0 intercept these instructions.
Failure to do so allowed the nested guest to run VMLOAD/VMSAVE unintercepted,
and thus read/write portions of the host physical memory.
Fixes: 89c8a4984fc9 ("KVM: SVM: Enable Virtual VMLOAD VMSAVE feature")
Suggested-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Maxim Levitsky <mlevitsk(a)redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
arch/x86/kvm/svm/nested.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 7f3b55561ae8..61f418644235 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -149,6 +149,9 @@ void recalc_intercepts(struct vcpu_svm *svm)
for (i = 0; i < MAX_INTERCEPT; i++)
c->intercepts[i] |= g->intercepts[i];
+
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
}
static void copy_vmcb_control_area(struct vmcb_control_area *dst,
--
2.27.0
From: Maxim Levitsky <mlevitsk(a)redhat.com>
[ upstream commit c7dfa4009965a9b2d7b329ee970eb8da0d32f0bc ]
If L1 disables VMLOAD/VMSAVE intercepts, and doesn't enable
Virtual VMLOAD/VMSAVE (currently not supported for the nested hypervisor),
then VMLOAD/VMSAVE must operate on the L1 physical memory, which is only
possible by making L0 intercept these instructions.
Failure to do so allowed the nested guest to run VMLOAD/VMSAVE unintercepted,
and thus read/write portions of the host physical memory.
Fixes: 89c8a4984fc9 ("KVM: SVM: Enable Virtual VMLOAD VMSAVE feature")
Suggested-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Maxim Levitsky <mlevitsk(a)redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
The above upstream SHA1 is still on its way to Linus
arch/x86/kvm/svm.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0dfd0af61a29..439aecbbdd1e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -389,6 +389,9 @@ static void recalc_intercepts(struct vcpu_svm *svm)
c->intercept_dr = h->intercept_dr | g->intercept_dr;
c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
c->intercept = h->intercept | g->intercept;
+
+ c->intercept |= (1ULL << INTERCEPT_VMLOAD);
+ c->intercept |= (1ULL << INTERCEPT_VMSAVE);
}
static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
--
2.26.3
From: Maxim Levitsky <mlevitsk(a)redhat.com>
[ upstream commit c7dfa4009965a9b2d7b329ee970eb8da0d32f0bc ]
If L1 disables VMLOAD/VMSAVE intercepts, and doesn't enable
Virtual VMLOAD/VMSAVE (currently not supported for the nested hypervisor),
then VMLOAD/VMSAVE must operate on the L1 physical memory, which is only
possible by making L0 intercept these instructions.
Failure to do so allowed the nested guest to run VMLOAD/VMSAVE unintercepted,
and thus read/write portions of the host physical memory.
Fixes: 89c8a4984fc9 ("KVM: SVM: Enable Virtual VMLOAD VMSAVE feature")
Suggested-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Maxim Levitsky <mlevitsk(a)redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
The above upstream SHA1 is still on its way to Linus
arch/x86/kvm/svm/nested.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 4b8635d2296a..8d96c28deadb 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -147,6 +147,9 @@ void recalc_intercepts(struct vcpu_svm *svm)
for (i = 0; i < MAX_INTERCEPT; i++)
c->intercepts[i] |= g->intercepts[i];
+
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
}
static void copy_vmcb_control_area(struct vmcb_control_area *dst,
--
2.26.3
From: Maxim Levitsky <mlevitsk(a)redhat.com>
[ upstream commit c7dfa4009965a9b2d7b329ee970eb8da0d32f0bc ]
If L1 disables VMLOAD/VMSAVE intercepts, and doesn't enable
Virtual VMLOAD/VMSAVE (currently not supported for the nested hypervisor),
then VMLOAD/VMSAVE must operate on the L1 physical memory, which is only
possible by making L0 intercept these instructions.
Failure to do so allowed the nested guest to run VMLOAD/VMSAVE unintercepted,
and thus read/write portions of the host physical memory.
Fixes: 89c8a4984fc9 ("KVM: SVM: Enable Virtual VMLOAD VMSAVE feature")
Suggested-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Maxim Levitsky <mlevitsk(a)redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
The above upstream SHA1 is still on its way to Linus
arch/x86/kvm/svm/nested.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 8a229db85302..df17146e841f 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -147,6 +147,9 @@ void recalc_intercepts(struct vcpu_svm *svm)
for (i = 0; i < MAX_INTERCEPT; i++)
c->intercepts[i] |= g->intercepts[i];
+
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
}
static void copy_vmcb_control_area(struct vmcb_control_area *dst,
--
2.26.3
From: Maxim Levitsky <mlevitsk(a)redhat.com>
[ upstream commit c7dfa4009965a9b2d7b329ee970eb8da0d32f0bc ]
If L1 disables VMLOAD/VMSAVE intercepts, and doesn't enable
Virtual VMLOAD/VMSAVE (currently not supported for the nested hypervisor),
then VMLOAD/VMSAVE must operate on the L1 physical memory, which is only
possible by making L0 intercept these instructions.
Failure to do so allowed the nested guest to run VMLOAD/VMSAVE unintercepted,
and thus read/write portions of the host physical memory.
Fixes: 89c8a4984fc9 ("KVM: SVM: Enable Virtual VMLOAD VMSAVE feature")
Suggested-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Maxim Levitsky <mlevitsk(a)redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
The above upstream SHA1 is still on its way to Linus
arch/x86/kvm/svm.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5ddf63896d01..0398819410f1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -513,6 +513,9 @@ static void recalc_intercepts(struct vcpu_svm *svm)
c->intercept_dr = h->intercept_dr | g->intercept_dr;
c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
c->intercept = h->intercept | g->intercept;
+
+ c->intercept |= (1ULL << INTERCEPT_VMLOAD);
+ c->intercept |= (1ULL << INTERCEPT_VMSAVE);
}
static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
--
2.26.3
Hi
[This is an automated email]
This commit has been processed because it contains a -stable tag.
The stable tag indicates that it's relevant for the following trees: all
The bot has tested the following trees: v5.5.11, v5.4.27, v4.19.112, v4.14.174, v4.9.217, v4.4.217.
v5.5.11: Build OK!
v5.4.27: Build OK!
v4.19.112: Failed to apply! Possible dependencies:
03512ceb60ae ("ieee80211: remove redundant leading zeroes")
09b4a4faf9d0 ("mac80211: introduce capability flags for VHT EXT NSS support")
0eeb2b674f05 ("mac80211: add an option for station management TXQ")
1c9559734eca ("mac80211: remove unnecessary key condition")
57a3a454f303 ("iwlwifi: split HE capabilities between AP and STA")
62872a9b9a10 ("mac80211: Fix PTK rekey freezes and clear text leak")
77f7ffdc335d ("mac80211: minstrel_ht: add flag to indicate missing/inaccurate tx A-MPDU length")
77ff2c6b4984 ("mac80211: update HE IEs to D3.3")
80aaa9c16415 ("mac80211: Add he_capa debugfs entry")
96fc6efb9ad9 ("mac80211: IEEE 802.11 Extended Key ID support")
add7453ad62f ("wireless: align to draft 11ax D3.0")
adf8ed01e4fd ("mac80211: add an optional TXQ for other PS-buffered frames")
caf56338c22f ("mac80211: indicate support for multiple BSSID")
daa5b83513a7 ("mac80211: update HE operation fields to D3.0")
v4.14.174: Failed to apply! Possible dependencies:
110b32f065f3 ("iwlwifi: mvm: rs: add basic implementation of the new RS API handlers")
1c73acf58bd6 ("iwlwifi: acpi: move ACPI method definitions to acpi.h")
28e9c00fe1f0 ("iwlwifi: remove upper case letters in sku_capa_band_*_enable")
4ae80f6c8d86 ("iwlwifi: support api ver2 of NVM_GET_INFO resp")
4b82455ca51e ("iwlwifi: use flags to denote modifiers for the channel maps")
4c625c564ba2 ("iwlwifi: get rid of fw/nvm.c")
514c30696fbc ("iwlwifi: add support for IEEE802.11ax")
57a3a454f303 ("iwlwifi: split HE capabilities between AP and STA")
77ff2c6b4984 ("mac80211: update HE IEs to D3.3")
813df5cef3bb ("iwlwifi: acpi: add common code to read from ACPI")
8a6171a7b601 ("iwlwifi: fw: add FW APIs for HE")
9c4f7d512740 ("iwlwifi: move all NVM parsing code to the common files")
9f66a397c877 ("iwlwifi: mvm: rs: add ops for the new rate scaling in the FW")
e7a3b8d87910 ("iwlwifi: acpi: move ACPI-related definitions to acpi.h")
v4.9.217: Failed to apply! Possible dependencies:
01796ff2fa6e ("iwlwifi: mvm: always free inactive queue when moving ownership")
0aaece81114e ("iwlwifi: split firmware API from iwl-trans.h")
1ea423b0e047 ("iwlwifi: remove unnecessary dev_cmd_headroom parameter")
310181ec34e2 ("iwlwifi: move to TVQM mode")
5594d80e9bf4 ("iwlwifi: support two phys for a000 devices")
623e7766be90 ("iwlwifi: pcie: introduce split point to a000 devices")
65e254821cee ("iwlwifi: mvm: use firmware station PM notification for AP_LINK_PS")
6b35ff91572f ("iwlwifi: pcie: introduce a000 TX queues management")
727c02dfb848 ("iwlwifi: pcie: cleanup rfkill checks")
77ff2c6b4984 ("mac80211: update HE IEs to D3.3")
8236f7db2724 ("iwlwifi: mvm: assign cab queue to the correct station")
87d0e1af9db3 ("iwlwifi: mvm: separate queue mapping from queue enablement")
bb49701b41de ("iwlwifi: mvm: support a000 SCD queue configuration")
cf90da352a32 ("iwlwifi: mvm: use mvm_disable_queue instead of sharing logic")
d172a5eff629 ("iwlwifi: reorganize firmware API")
df88c08d5c7e ("iwlwifi: mvm: release static queues on bcast release")
eda50cde58de ("iwlwifi: pcie: add context information support")
v4.4.217: Failed to apply! Possible dependencies:
0aaece81114e ("iwlwifi: split firmware API from iwl-trans.h")
13555e8ba2f4 ("iwlwifi: mvm: add 9000-series RX API")
1a616dd2f171 ("iwlwifi: dump prph registers in a common place for all transports")
1e0bbebaae66 ("mac80211: enable starting BA session with custom timeout")
2f89a5d7d377 ("iwlwifi: mvm: move fw-dbg code to separate file")
39bdb17ebb5b ("iwlwifi: update host command messages to new format")
41837ca962ec ("iwlwifi: pcie: allow to pretend to have Tx CSUM for debug")
4707fde5cdef ("iwlwifi: mvm: use build-time assertion for fw trigger ID")
5b88792cd850 ("iwlwifi: move to wide ID for all commands")
6c4fbcbc1c95 ("iwlwifi: add support for 12K Receive Buffers")
77ff2c6b4984 ("mac80211: update HE IEs to D3.3")
92fe83430b89 ("iwlwifi: uninline iwl_trans_send_cmd")
d172a5eff629 ("iwlwifi: reorganize firmware API")
dcbb4746286a ("iwlwifi: trans: support a callback for ASYNC commands")
NOTE: The patch will not be queued to stable trees until it is upstream.
How should we proceed with this patch?
--
Thanks
Sasha
From: Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
[ Upstream commit 327d5b2fee91c404a3956c324193892cf2cc9528 ]
The IOMMU driver calculates the guest addressability for a DMA request
based on the value of the mgaw reported from the IOMMU. However, this
is a fused value and as mentioned in the spec, the guest width
should be calculated based on the minimum of supported adjusted guest
address width (SAGAW) and MGAW.
This is from specification:
"Guest addressability for a given DMA request is limited to the
minimum of the value reported through this field and the adjusted
guest address width of the corresponding page-table structure.
(Adjusted guest address widths supported by hardware are reported
through the SAGAW field)."
This causes domain initialization to fail and following
errors appear for EHCI PCI driver:
[ 2.486393] ehci-pci 0000:01:00.4: EHCI Host Controller
[ 2.486624] ehci-pci 0000:01:00.4: new USB bus registered, assigned bus
number 1
[ 2.489127] ehci-pci 0000:01:00.4: DMAR: Allocating domain failed
[ 2.489350] ehci-pci 0000:01:00.4: DMAR: 32bit DMA uses non-identity
mapping
[ 2.489359] ehci-pci 0000:01:00.4: can't setup: -12
[ 2.489531] ehci-pci 0000:01:00.4: USB bus 1 deregistered
[ 2.490023] ehci-pci 0000:01:00.4: init 0000:01:00.4 fail, -12
[ 2.490358] ehci-pci: probe of 0000:01:00.4 failed with error -12
This issue happens when the value of the sagaw corresponds to a
48-bit agaw. This fix updates the calculation of the agaw based on
the minimum of IOMMU's sagaw value and MGAW.
This issue happens on the code path of getting a private domain for a
device. A private domain was needed when the domain of an iommu group
couldn't meet the requirement of a device. The IOMMU core has been
evolved to eliminate the need for private domain, hence this code path
has alreay been removed from the upstream since commit 327d5b2fee91c
("iommu/vt-d: Allow 32bit devices to uses DMA domain"). Instead of back
porting all patches that are required for removing the private domain,
this simply fixes it in the affected stable kernel between v4.16 and v5.7.
[baolu: The orignal patch could be found here
https://lore.kernel.org/linux-iommu/20210412202736.70765-1-saeed.mirzamoham….
I added commit message according to Greg's comments at
https://lore.kernel.org/linux-iommu/YHZ%2FT9x7Xjf1r6fI@kroah.com/.]
Cc: Joerg Roedel <joro(a)8bytes.org>
Cc: Ashok Raj <ashok.raj(a)intel.com>
Cc: stable(a)vger.kernel.org #v4.16+
Signed-off-by: Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
Tested-by: Camille Lu <camille.lu(a)hpe.com>
Signed-off-by: Lu Baolu <baolu.lu(a)linux.intel.com>
---
drivers/iommu/intel-iommu.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 953d86ca6d2b..a2a03df97704 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1853,7 +1853,7 @@ static inline int guestwidth_to_adjustwidth(int gaw)
static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
int guest_width)
{
- int adjust_width, agaw;
+ int adjust_width, agaw, cap_width;
unsigned long sagaw;
int err;
@@ -1867,8 +1867,9 @@ static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
domain_reserve_special_ranges(domain);
/* calculate AGAW */
- if (guest_width > cap_mgaw(iommu->cap))
- guest_width = cap_mgaw(iommu->cap);
+ cap_width = min_t(int, cap_mgaw(iommu->cap), agaw_to_width(iommu->agaw));
+ if (guest_width > cap_width)
+ guest_width = cap_width;
domain->gaw = guest_width;
adjust_width = guestwidth_to_adjustwidth(guest_width);
agaw = width_to_agaw(adjust_width);
--
2.25.1
The patch below does not apply to the 5.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 20c0b380f971e7d48f5d978bc27d827f7eabb21a Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit(a)vmware.com>
Date: Sat, 7 Aug 2021 17:13:42 -0700
Subject: [PATCH] io_uring: Use WRITE_ONCE() when writing to sq_flags
The compiler should be forbidden from any strange optimization for async
writes to user visible data-structures. Without proper protection, the
compiler can cause write-tearing or invent writes that would confuse the
userspace.
However, there are writes to sq_flags which are not protected by
WRITE_ONCE(). Use WRITE_ONCE() for these writes.
This is purely a theoretical issue. Presumably, any compiler is very
unlikely to do such optimizations.
Fixes: 75b28affdd6a ("io_uring: allocate the two rings together")
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Nadav Amit <namit(a)vmware.com>
Link: https://lore.kernel.org/r/20210808001342.964634-3-namit@vmware.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1093df3977b8..ca064486cb41 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1500,7 +1500,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
clear_bit(0, &ctx->check_cq_overflow);
- ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
}
if (posted)
@@ -1579,7 +1580,9 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
}
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->check_cq_overflow);
- ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+
}
ocqe->cqe.user_data = user_data;
ocqe->cqe.res = res;
@@ -6804,14 +6807,16 @@ static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
/* Tell userspace we may need a wakeup call */
spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
spin_unlock_irq(&ctx->completion_lock);
}
static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
{
spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
spin_unlock_irq(&ctx->completion_lock);
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 20c0b380f971e7d48f5d978bc27d827f7eabb21a Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit(a)vmware.com>
Date: Sat, 7 Aug 2021 17:13:42 -0700
Subject: [PATCH] io_uring: Use WRITE_ONCE() when writing to sq_flags
The compiler should be forbidden from any strange optimization for async
writes to user visible data-structures. Without proper protection, the
compiler can cause write-tearing or invent writes that would confuse the
userspace.
However, there are writes to sq_flags which are not protected by
WRITE_ONCE(). Use WRITE_ONCE() for these writes.
This is purely a theoretical issue. Presumably, any compiler is very
unlikely to do such optimizations.
Fixes: 75b28affdd6a ("io_uring: allocate the two rings together")
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Nadav Amit <namit(a)vmware.com>
Link: https://lore.kernel.org/r/20210808001342.964634-3-namit@vmware.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1093df3977b8..ca064486cb41 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1500,7 +1500,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
clear_bit(0, &ctx->check_cq_overflow);
- ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
}
if (posted)
@@ -1579,7 +1580,9 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
}
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->check_cq_overflow);
- ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+
}
ocqe->cqe.user_data = user_data;
ocqe->cqe.res = res;
@@ -6804,14 +6807,16 @@ static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
/* Tell userspace we may need a wakeup call */
spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
spin_unlock_irq(&ctx->completion_lock);
}
static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
{
spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
spin_unlock_irq(&ctx->completion_lock);
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 20c0b380f971e7d48f5d978bc27d827f7eabb21a Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit(a)vmware.com>
Date: Sat, 7 Aug 2021 17:13:42 -0700
Subject: [PATCH] io_uring: Use WRITE_ONCE() when writing to sq_flags
The compiler should be forbidden from any strange optimization for async
writes to user visible data-structures. Without proper protection, the
compiler can cause write-tearing or invent writes that would confuse the
userspace.
However, there are writes to sq_flags which are not protected by
WRITE_ONCE(). Use WRITE_ONCE() for these writes.
This is purely a theoretical issue. Presumably, any compiler is very
unlikely to do such optimizations.
Fixes: 75b28affdd6a ("io_uring: allocate the two rings together")
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Nadav Amit <namit(a)vmware.com>
Link: https://lore.kernel.org/r/20210808001342.964634-3-namit@vmware.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1093df3977b8..ca064486cb41 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1500,7 +1500,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
clear_bit(0, &ctx->check_cq_overflow);
- ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
}
if (posted)
@@ -1579,7 +1580,9 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
}
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->check_cq_overflow);
- ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+
}
ocqe->cqe.user_data = user_data;
ocqe->cqe.res = res;
@@ -6804,14 +6807,16 @@ static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
/* Tell userspace we may need a wakeup call */
spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
spin_unlock_irq(&ctx->completion_lock);
}
static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
{
spin_lock_irq(&ctx->completion_lock);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ WRITE_ONCE(ctx->rings->sq_flags,
+ ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
spin_unlock_irq(&ctx->completion_lock);
}
Reading status register can fail in the interrupt handler. In such
case, the regmap_read() will not store anything useful under passed
'val' variable and random stack value will be used to determine type of
interrupt.
Handle the regmap_read() failure to avoid handling interrupt type and
triggering changed power supply event based on random stack value.
Fixes: 39e7213edc4f ("max17042_battery: Support regmap to access device's registers")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)canonical.com>
---
drivers/power/supply/max17042_battery.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c
index ce2041b30a06..858ae97600d4 100644
--- a/drivers/power/supply/max17042_battery.c
+++ b/drivers/power/supply/max17042_battery.c
@@ -869,8 +869,12 @@ static irqreturn_t max17042_thread_handler(int id, void *dev)
{
struct max17042_chip *chip = dev;
u32 val;
+ int ret;
+
+ ret = regmap_read(chip->regmap, MAX17042_STATUS, &val);
+ if (ret)
+ return IRQ_HANDLED;
- regmap_read(chip->regmap, MAX17042_STATUS, &val);
if ((val & STATUS_INTR_SOCMIN_BIT) ||
(val & STATUS_INTR_SOCMAX_BIT)) {
dev_info(&chip->client->dev, "SOC threshold INTR\n");
--
2.30.2
Hi Greg,
On Sun, Aug 15, 2021 at 02:54:26PM +0200, gregkh(a)linuxfoundation.org wrote:
>
> This is a note to let you know that I've just added the patch titled
>
> vmlinux.lds.h: Handle clang's module.{c,d}tor sections
>
> to the 5.13-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> vmlinux.lds.h-handle-clang-s-module.-c-d-tor-sections.patch
> and it can be found in the queue-5.13 subdirectory.
Attached are backports for 4.4 to 5.10. I am not sure if anyone is
actually using KASAN with clang on 4.4 (ChromeOS maybe?) but it does not
hurt to have it just in case.
I did not get any emails that the patch failed to apply on the older
versions, I assume this is because I did just a "Cc: stable(a)vger.kernel.org"
without any version or fixes tag. Is there any "official" way to notate
that I want a particular patch applied to all supported kernel versions
aside from adding "# v4.4+" to the Cc tag so that I can provide manual
backports for those versions?
Cheers,
Nathan
The patch below does not apply to the 5.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8434ffe71c874b9c4e184b88d25de98c2bf5fe3f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton(a)kernel.org>
Date: Tue, 3 Aug 2021 12:47:34 -0400
Subject: [PATCH] ceph: take snap_empty_lock atomically with snaprealm refcount
change
There is a race in ceph_put_snap_realm. The change to the nref and the
spinlock acquisition are not done atomically, so you could decrement
nref, and before you take the spinlock, the nref is incremented again.
At that point, you end up putting it on the empty list when it
shouldn't be there. Eventually __cleanup_empty_realms runs and frees
it when it's still in-use.
Fix this by protecting the 1->0 transition with atomic_dec_and_lock,
and just drop the spinlock if we can get the rwsem.
Because these objects can also undergo a 0->1 refcount transition, we
must protect that change as well with the spinlock. Increment locklessly
unless the value is at 0, in which case we take the spinlock, increment
and then take it off the empty list if it did the 0->1 transition.
With these changes, I'm removing the dout() messages from these
functions, as well as in __put_snap_realm. They've always been racy, and
it's better to not print values that may be misleading.
Cc: stable(a)vger.kernel.org
URL: https://tracker.ceph.com/issues/46419
Reported-by: Mark Nelson <mnelson(a)redhat.com>
Signed-off-by: Jeff Layton <jlayton(a)kernel.org>
Reviewed-by: Luis Henriques <lhenriques(a)suse.de>
Signed-off-by: Ilya Dryomov <idryomov(a)gmail.com>
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4ac0606dcbd4..4c6bd1042c94 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -67,19 +67,19 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
{
lockdep_assert_held(&mdsc->snap_rwsem);
- dout("get_realm %p %d -> %d\n", realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
/*
- * since we _only_ increment realm refs or empty the empty
- * list with snap_rwsem held, adjusting the empty list here is
- * safe. we do need to protect against concurrent empty list
- * additions, however.
+ * The 0->1 and 1->0 transitions must take the snap_empty_lock
+ * atomically with the refcount change. Go ahead and bump the
+ * nref here, unless it's 0, in which case we take the spinlock
+ * and then do the increment and remove it from the list.
*/
- if (atomic_inc_return(&realm->nref) == 1) {
- spin_lock(&mdsc->snap_empty_lock);
+ if (atomic_inc_not_zero(&realm->nref))
+ return;
+
+ spin_lock(&mdsc->snap_empty_lock);
+ if (atomic_inc_return(&realm->nref) == 1)
list_del_init(&realm->empty_item);
- spin_unlock(&mdsc->snap_empty_lock);
- }
+ spin_unlock(&mdsc->snap_empty_lock);
}
static void __insert_snap_realm(struct rb_root *root,
@@ -208,28 +208,28 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc,
{
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ /*
+ * We do not require the snap_empty_lock here, as any caller that
+ * increments the value must hold the snap_rwsem.
+ */
if (atomic_dec_and_test(&realm->nref))
__destroy_snap_realm(mdsc, realm);
}
/*
- * caller needn't hold any locks
+ * See comments in ceph_get_snap_realm. Caller needn't hold any locks.
*/
void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
- if (!atomic_dec_and_test(&realm->nref))
+ if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock))
return;
if (down_write_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&mdsc->snap_empty_lock);
__destroy_snap_realm(mdsc, realm);
up_write(&mdsc->snap_rwsem);
} else {
- spin_lock(&mdsc->snap_empty_lock);
list_add(&realm->empty_item, &mdsc->snap_empty);
spin_unlock(&mdsc->snap_empty_lock);
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 524a1e4e381fc5e7781008d5bd420fd1357c0113 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Thu, 12 Aug 2021 11:14:13 -0700
Subject: [PATCH] KVM: x86/mmu: Don't leak non-leaf SPTEs when zapping all
SPTEs
Pass "all ones" as the end GFN to signal "zap all" for the TDP MMU and
really zap all SPTEs in this case. As is, zap_gfn_range() skips non-leaf
SPTEs whose range exceeds the range to be zapped. If shadow_phys_bits is
not aligned to the range size of top-level SPTEs, e.g. 512gb with 4-level
paging, the "zap all" flows will skip top-level SPTEs whose range extends
beyond shadow_phys_bits and leak their SPs when the VM is destroyed.
Use the current upper bound (based on host.MAXPHYADDR) to detect that the
caller wants to zap all SPTEs, e.g. instead of using the max theoretical
gfn, 1 << (52 - 12). The more precise upper bound allows the TDP iterator
to terminate its walk earlier when running on hosts with MAXPHYADDR < 52.
Add a WARN on kmv->arch.tdp_mmu_pages when the TDP MMU is destroyed to
help future debuggers should KVM decide to leak SPTEs again.
The bug is most easily reproduced by running (and unloading!) KVM in a
VM whose host.MAXPHYADDR < 39, as the SPTE for gfn=0 will be skipped.
=============================================================================
BUG kvm_mmu_page_header (Not tainted): Objects remaining in kvm_mmu_page_header on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
Slab 0x000000004d8f7af1 objects=22 used=2 fp=0x00000000624d29ac flags=0x4000000000000200(slab|zone=1)
CPU: 0 PID: 1582 Comm: rmmod Not tainted 5.14.0-rc2+ #420
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
Call Trace:
dump_stack_lvl+0x45/0x59
slab_err+0x95/0xc9
__kmem_cache_shutdown.cold+0x3c/0x158
kmem_cache_destroy+0x3d/0xf0
kvm_mmu_module_exit+0xa/0x30 [kvm]
kvm_arch_exit+0x5d/0x90 [kvm]
kvm_exit+0x78/0x90 [kvm]
vmx_exit+0x1a/0x50 [kvm_intel]
__x64_sys_delete_module+0x13f/0x220
do_syscall_64+0x3b/0xc0
entry_SYSCALL_64_after_hwframe+0x44/0xae
Fixes: faaf05b00aec ("kvm: x86/mmu: Support zapping SPTEs in the TDP MMU")
Cc: stable(a)vger.kernel.org
Cc: Ben Gardon <bgardon(a)google.com>
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20210812181414.3376143-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 0853370bd811..8783b9eb2b33 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
if (!kvm->arch.tdp_mmu_enabled)
return;
+ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
@@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
{
- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
@@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
list_del_rcu(&root->link);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
- zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
+ zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
@@ -724,8 +723,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, bool can_yield, bool flush,
bool shared)
{
+ gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ bool zap_all = (start == 0 && end >= max_gfn_host);
struct tdp_iter iter;
+ /*
+ * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
+ * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
+ * and so KVM will never install a SPTE for such addresses.
+ */
+ end = min(end, max_gfn_host);
+
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
rcu_read_lock();
@@ -744,9 +752,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
/*
* If this is a non-last-level SPTE that covers a larger range
* than should be zapped, continue, and zap the mappings at a
- * lower level.
+ * lower level, except when zapping all SPTEs.
*/
- if ((iter.gfn < start ||
+ if (!zap_all &&
+ (iter.gfn < start ||
iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
!is_last_spte(iter.old_spte, iter.level))
continue;
@@ -794,12 +803,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
bool flush = false;
int i;
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
flush, false);
if (flush)
@@ -838,7 +846,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
*/
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
struct kvm_mmu_page *next_root;
struct kvm_mmu_page *root;
bool flush = false;
@@ -854,8 +861,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
rcu_read_unlock();
- flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
- true);
+ flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
/*
* Put the reference acquired in
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b9255a7cb51754e8d2645b65dd31805e282b4f3e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 29 Jul 2021 23:51:43 +0200
Subject: [PATCH] PCI/MSI: Enforce MSI[X] entry updates to be visible
Nothing enforces the posted writes to be visible when the function
returns. Flush them even if the flush might be redundant when the entry is
masked already as the unmask will flush as well. This is either setup or a
rare affinity change event so the extra flush is not the end of the world.
While this is more a theoretical issue especially the logic in the X86
specific msi_set_affinity() function relies on the assumption that the
update has reached the hardware when the function returns.
Again, as this never has been enforced the Fixes tag refers to a commit in:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.515188147@linutronix.de
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 7ee1ac47caa7..434c7041b176 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -311,6 +311,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
if (unmasked)
__pci_msix_desc_mask_irq(entry, 0);
+
+ /* Ensure that the writes are visible in the device */
+ readl(base + PCI_MSIX_ENTRY_DATA);
} else {
int pos = dev->msi_cap;
u16 msgctl;
@@ -331,6 +334,8 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
msg->data);
}
+ /* Ensure that the writes are visible in the device */
+ pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
}
skip:
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b9255a7cb51754e8d2645b65dd31805e282b4f3e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 29 Jul 2021 23:51:43 +0200
Subject: [PATCH] PCI/MSI: Enforce MSI[X] entry updates to be visible
Nothing enforces the posted writes to be visible when the function
returns. Flush them even if the flush might be redundant when the entry is
masked already as the unmask will flush as well. This is either setup or a
rare affinity change event so the extra flush is not the end of the world.
While this is more a theoretical issue especially the logic in the X86
specific msi_set_affinity() function relies on the assumption that the
update has reached the hardware when the function returns.
Again, as this never has been enforced the Fixes tag refers to a commit in:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.515188147@linutronix.de
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 7ee1ac47caa7..434c7041b176 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -311,6 +311,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
if (unmasked)
__pci_msix_desc_mask_irq(entry, 0);
+
+ /* Ensure that the writes are visible in the device */
+ readl(base + PCI_MSIX_ENTRY_DATA);
} else {
int pos = dev->msi_cap;
u16 msgctl;
@@ -331,6 +334,8 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
msg->data);
}
+ /* Ensure that the writes are visible in the device */
+ pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
}
skip:
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b9255a7cb51754e8d2645b65dd31805e282b4f3e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 29 Jul 2021 23:51:43 +0200
Subject: [PATCH] PCI/MSI: Enforce MSI[X] entry updates to be visible
Nothing enforces the posted writes to be visible when the function
returns. Flush them even if the flush might be redundant when the entry is
masked already as the unmask will flush as well. This is either setup or a
rare affinity change event so the extra flush is not the end of the world.
While this is more a theoretical issue especially the logic in the X86
specific msi_set_affinity() function relies on the assumption that the
update has reached the hardware when the function returns.
Again, as this never has been enforced the Fixes tag refers to a commit in:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.515188147@linutronix.de
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 7ee1ac47caa7..434c7041b176 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -311,6 +311,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
if (unmasked)
__pci_msix_desc_mask_irq(entry, 0);
+
+ /* Ensure that the writes are visible in the device */
+ readl(base + PCI_MSIX_ENTRY_DATA);
} else {
int pos = dev->msi_cap;
u16 msgctl;
@@ -331,6 +334,8 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
msg->data);
}
+ /* Ensure that the writes are visible in the device */
+ pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
}
skip:
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b9255a7cb51754e8d2645b65dd31805e282b4f3e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 29 Jul 2021 23:51:43 +0200
Subject: [PATCH] PCI/MSI: Enforce MSI[X] entry updates to be visible
Nothing enforces the posted writes to be visible when the function
returns. Flush them even if the flush might be redundant when the entry is
masked already as the unmask will flush as well. This is either setup or a
rare affinity change event so the extra flush is not the end of the world.
While this is more a theoretical issue especially the logic in the X86
specific msi_set_affinity() function relies on the assumption that the
update has reached the hardware when the function returns.
Again, as this never has been enforced the Fixes tag refers to a commit in:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.515188147@linutronix.de
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 7ee1ac47caa7..434c7041b176 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -311,6 +311,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
if (unmasked)
__pci_msix_desc_mask_irq(entry, 0);
+
+ /* Ensure that the writes are visible in the device */
+ readl(base + PCI_MSIX_ENTRY_DATA);
} else {
int pos = dev->msi_cap;
u16 msgctl;
@@ -331,6 +334,8 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
msg->data);
}
+ /* Ensure that the writes are visible in the device */
+ pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
}
skip:
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From da181dc974ad667579baece33c2c8d2d1e4558d5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 29 Jul 2021 23:51:42 +0200
Subject: [PATCH] PCI/MSI: Enforce that MSI-X table entry is masked for update
The specification (PCIe r5.0, sec 6.1.4.5) states:
For MSI-X, a function is permitted to cache Address and Data values
from unmasked MSI-X Table entries. However, anytime software unmasks a
currently masked MSI-X Table entry either by clearing its Mask bit or
by clearing the Function Mask bit, the function must update any Address
or Data values that it cached from that entry. If software changes the
Address or Data value of an entry while the entry is unmasked, the
result is undefined.
The Linux kernel's MSI-X support never enforced that the entry is masked
before the entry is modified hence the Fixes tag refers to a commit in:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Enforce the entry to be masked across the update.
There is no point in enforcing this to be handled at all possible call
sites as this is just pointless code duplication and the common update
function is the obvious place to enforce this.
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
Reported-by: Kevin Tian <kevin.tian(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.462096385@linutronix.de
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 57c9ec9b976b..7ee1ac47caa7 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -289,13 +289,28 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
/* Don't touch the hardware now */
} else if (entry->msi_attrib.is_msix) {
void __iomem *base = pci_msix_desc_addr(entry);
+ bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT);
if (!base)
goto skip;
+ /*
+ * The specification mandates that the entry is masked
+ * when the message is modified:
+ *
+ * "If software changes the Address or Data value of an
+ * entry while the entry is unmasked, the result is
+ * undefined."
+ */
+ if (unmasked)
+ __pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT);
+
writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
+
+ if (unmasked)
+ __pci_msix_desc_mask_irq(entry, 0);
} else {
int pos = dev->msi_cap;
u16 msgctl;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From da181dc974ad667579baece33c2c8d2d1e4558d5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 29 Jul 2021 23:51:42 +0200
Subject: [PATCH] PCI/MSI: Enforce that MSI-X table entry is masked for update
The specification (PCIe r5.0, sec 6.1.4.5) states:
For MSI-X, a function is permitted to cache Address and Data values
from unmasked MSI-X Table entries. However, anytime software unmasks a
currently masked MSI-X Table entry either by clearing its Mask bit or
by clearing the Function Mask bit, the function must update any Address
or Data values that it cached from that entry. If software changes the
Address or Data value of an entry while the entry is unmasked, the
result is undefined.
The Linux kernel's MSI-X support never enforced that the entry is masked
before the entry is modified hence the Fixes tag refers to a commit in:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Enforce the entry to be masked across the update.
There is no point in enforcing this to be handled at all possible call
sites as this is just pointless code duplication and the common update
function is the obvious place to enforce this.
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
Reported-by: Kevin Tian <kevin.tian(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.462096385@linutronix.de
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 57c9ec9b976b..7ee1ac47caa7 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -289,13 +289,28 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
/* Don't touch the hardware now */
} else if (entry->msi_attrib.is_msix) {
void __iomem *base = pci_msix_desc_addr(entry);
+ bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT);
if (!base)
goto skip;
+ /*
+ * The specification mandates that the entry is masked
+ * when the message is modified:
+ *
+ * "If software changes the Address or Data value of an
+ * entry while the entry is unmasked, the result is
+ * undefined."
+ */
+ if (unmasked)
+ __pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT);
+
writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
+
+ if (unmasked)
+ __pci_msix_desc_mask_irq(entry, 0);
} else {
int pos = dev->msi_cap;
u16 msgctl;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From da181dc974ad667579baece33c2c8d2d1e4558d5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 29 Jul 2021 23:51:42 +0200
Subject: [PATCH] PCI/MSI: Enforce that MSI-X table entry is masked for update
The specification (PCIe r5.0, sec 6.1.4.5) states:
For MSI-X, a function is permitted to cache Address and Data values
from unmasked MSI-X Table entries. However, anytime software unmasks a
currently masked MSI-X Table entry either by clearing its Mask bit or
by clearing the Function Mask bit, the function must update any Address
or Data values that it cached from that entry. If software changes the
Address or Data value of an entry while the entry is unmasked, the
result is undefined.
The Linux kernel's MSI-X support never enforced that the entry is masked
before the entry is modified hence the Fixes tag refers to a commit in:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Enforce the entry to be masked across the update.
There is no point in enforcing this to be handled at all possible call
sites as this is just pointless code duplication and the common update
function is the obvious place to enforce this.
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
Reported-by: Kevin Tian <kevin.tian(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.462096385@linutronix.de
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 57c9ec9b976b..7ee1ac47caa7 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -289,13 +289,28 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
/* Don't touch the hardware now */
} else if (entry->msi_attrib.is_msix) {
void __iomem *base = pci_msix_desc_addr(entry);
+ bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT);
if (!base)
goto skip;
+ /*
+ * The specification mandates that the entry is masked
+ * when the message is modified:
+ *
+ * "If software changes the Address or Data value of an
+ * entry while the entry is unmasked, the result is
+ * undefined."
+ */
+ if (unmasked)
+ __pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT);
+
writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
+
+ if (unmasked)
+ __pci_msix_desc_mask_irq(entry, 0);
} else {
int pos = dev->msi_cap;
u16 msgctl;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From da181dc974ad667579baece33c2c8d2d1e4558d5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 29 Jul 2021 23:51:42 +0200
Subject: [PATCH] PCI/MSI: Enforce that MSI-X table entry is masked for update
The specification (PCIe r5.0, sec 6.1.4.5) states:
For MSI-X, a function is permitted to cache Address and Data values
from unmasked MSI-X Table entries. However, anytime software unmasks a
currently masked MSI-X Table entry either by clearing its Mask bit or
by clearing the Function Mask bit, the function must update any Address
or Data values that it cached from that entry. If software changes the
Address or Data value of an entry while the entry is unmasked, the
result is undefined.
The Linux kernel's MSI-X support never enforced that the entry is masked
before the entry is modified hence the Fixes tag refers to a commit in:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Enforce the entry to be masked across the update.
There is no point in enforcing this to be handled at all possible call
sites as this is just pointless code duplication and the common update
function is the obvious place to enforce this.
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
Reported-by: Kevin Tian <kevin.tian(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.462096385@linutronix.de
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 57c9ec9b976b..7ee1ac47caa7 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -289,13 +289,28 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
/* Don't touch the hardware now */
} else if (entry->msi_attrib.is_msix) {
void __iomem *base = pci_msix_desc_addr(entry);
+ bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT);
if (!base)
goto skip;
+ /*
+ * The specification mandates that the entry is masked
+ * when the message is modified:
+ *
+ * "If software changes the Address or Data value of an
+ * entry while the entry is unmasked, the result is
+ * undefined."
+ */
+ if (unmasked)
+ __pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT);
+
writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
+
+ if (unmasked)
+ __pci_msix_desc_mask_irq(entry, 0);
} else {
int pos = dev->msi_cap;
u16 msgctl;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 7d5ec3d3612396dc6d4b76366d20ab9fc06f399f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 29 Jul 2021 23:51:41 +0200
Subject: [PATCH] PCI/MSI: Mask all unused MSI-X entries
When MSI-X is enabled the ordering of calls is:
msix_map_region();
msix_setup_entries();
pci_msi_setup_msi_irqs();
msix_program_entries();
This has a few interesting issues:
1) msix_setup_entries() allocates the MSI descriptors and initializes them
except for the msi_desc:masked member which is left zero initialized.
2) pci_msi_setup_msi_irqs() allocates the interrupt descriptors and sets
up the MSI interrupts which ends up in pci_write_msi_msg() unless the
interrupt chip provides its own irq_write_msi_msg() function.
3) msix_program_entries() does not do what the name suggests. It solely
updates the entries array (if not NULL) and initializes the masked
member for each MSI descriptor by reading the hardware state and then
masks the entry.
Obviously this has some issues:
1) The uninitialized masked member of msi_desc prevents the enforcement
of masking the entry in pci_write_msi_msg() depending on the cached
masked bit. Aside of that half initialized data is a NONO in general
2) msix_program_entries() only ensures that the actually allocated entries
are masked. This is wrong as experimentation with crash testing and
crash kernel kexec has shown.
This limited testing unearthed that when the production kernel had more
entries in use and unmasked when it crashed and the crash kernel
allocated a smaller amount of entries, then a full scan of all entries
found unmasked entries which were in use in the production kernel.
This is obviously a device or emulation issue as the device reset
should mask all MSI-X table entries, but obviously that's just part
of the paper specification.
Cure this by:
1) Masking all table entries in hardware
2) Initializing msi_desc::masked in msix_setup_entries()
3) Removing the mask dance in msix_program_entries()
4) Renaming msix_program_entries() to msix_update_entries() to
reflect the purpose of that function.
As the masking of unused entries has never been done the Fixes tag refers
to a commit in:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.403833459@linutronix.de
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 5d39ed828085..57c9ec9b976b 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -691,6 +691,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
{
struct irq_affinity_desc *curmsk, *masks = NULL;
struct msi_desc *entry;
+ void __iomem *addr;
int ret, i;
int vec_count = pci_msix_vec_count(dev);
@@ -711,6 +712,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
entry->msi_attrib.is_msix = 1;
entry->msi_attrib.is_64 = 1;
+
if (entries)
entry->msi_attrib.entry_nr = entries[i].entry;
else
@@ -722,6 +724,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
entry->msi_attrib.default_irq = dev->irq;
entry->mask_base = base;
+ addr = pci_msix_desc_addr(entry);
+ if (addr)
+ entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
+
list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
if (masks)
curmsk++;
@@ -732,26 +738,25 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
return ret;
}
-static void msix_program_entries(struct pci_dev *dev,
- struct msix_entry *entries)
+static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries)
{
struct msi_desc *entry;
- int i = 0;
- void __iomem *desc_addr;
for_each_pci_msi_entry(entry, dev) {
- if (entries)
- entries[i++].vector = entry->irq;
+ if (entries) {
+ entries->vector = entry->irq;
+ entries++;
+ }
+ }
+}
- desc_addr = pci_msix_desc_addr(entry);
- if (desc_addr)
- entry->masked = readl(desc_addr +
- PCI_MSIX_ENTRY_VECTOR_CTRL);
- else
- entry->masked = 0;
+static void msix_mask_all(void __iomem *base, int tsize)
+{
+ u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
+ int i;
- msix_mask_irq(entry, 1);
- }
+ for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
+ writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL);
}
/**
@@ -768,9 +773,9 @@ static void msix_program_entries(struct pci_dev *dev,
static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
int nvec, struct irq_affinity *affd)
{
- int ret;
- u16 control;
void __iomem *base;
+ int ret, tsize;
+ u16 control;
/*
* Some devices require MSI-X to be enabled before the MSI-X
@@ -782,12 +787,16 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
/* Request & Map MSI-X table region */
- base = msix_map_region(dev, msix_table_size(control));
+ tsize = msix_table_size(control);
+ base = msix_map_region(dev, tsize);
if (!base) {
ret = -ENOMEM;
goto out_disable;
}
+ /* Ensure that all table entries are masked. */
+ msix_mask_all(base, tsize);
+
ret = msix_setup_entries(dev, base, entries, nvec, affd);
if (ret)
goto out_disable;
@@ -801,7 +810,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
if (ret)
goto out_free;
- msix_program_entries(dev, entries);
+ msix_update_entries(dev, entries);
ret = populate_msi_sysfs(dev);
if (ret)
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 7d5ec3d3612396dc6d4b76366d20ab9fc06f399f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Thu, 29 Jul 2021 23:51:41 +0200
Subject: [PATCH] PCI/MSI: Mask all unused MSI-X entries
When MSI-X is enabled the ordering of calls is:
msix_map_region();
msix_setup_entries();
pci_msi_setup_msi_irqs();
msix_program_entries();
This has a few interesting issues:
1) msix_setup_entries() allocates the MSI descriptors and initializes them
except for the msi_desc:masked member which is left zero initialized.
2) pci_msi_setup_msi_irqs() allocates the interrupt descriptors and sets
up the MSI interrupts which ends up in pci_write_msi_msg() unless the
interrupt chip provides its own irq_write_msi_msg() function.
3) msix_program_entries() does not do what the name suggests. It solely
updates the entries array (if not NULL) and initializes the masked
member for each MSI descriptor by reading the hardware state and then
masks the entry.
Obviously this has some issues:
1) The uninitialized masked member of msi_desc prevents the enforcement
of masking the entry in pci_write_msi_msg() depending on the cached
masked bit. Aside of that half initialized data is a NONO in general
2) msix_program_entries() only ensures that the actually allocated entries
are masked. This is wrong as experimentation with crash testing and
crash kernel kexec has shown.
This limited testing unearthed that when the production kernel had more
entries in use and unmasked when it crashed and the crash kernel
allocated a smaller amount of entries, then a full scan of all entries
found unmasked entries which were in use in the production kernel.
This is obviously a device or emulation issue as the device reset
should mask all MSI-X table entries, but obviously that's just part
of the paper specification.
Cure this by:
1) Masking all table entries in hardware
2) Initializing msi_desc::masked in msix_setup_entries()
3) Removing the mask dance in msix_program_entries()
4) Renaming msix_program_entries() to msix_update_entries() to
reflect the purpose of that function.
As the masking of unused entries has never been done the Fixes tag refers
to a commit in:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.403833459@linutronix.de
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 5d39ed828085..57c9ec9b976b 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -691,6 +691,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
{
struct irq_affinity_desc *curmsk, *masks = NULL;
struct msi_desc *entry;
+ void __iomem *addr;
int ret, i;
int vec_count = pci_msix_vec_count(dev);
@@ -711,6 +712,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
entry->msi_attrib.is_msix = 1;
entry->msi_attrib.is_64 = 1;
+
if (entries)
entry->msi_attrib.entry_nr = entries[i].entry;
else
@@ -722,6 +724,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
entry->msi_attrib.default_irq = dev->irq;
entry->mask_base = base;
+ addr = pci_msix_desc_addr(entry);
+ if (addr)
+ entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
+
list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
if (masks)
curmsk++;
@@ -732,26 +738,25 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
return ret;
}
-static void msix_program_entries(struct pci_dev *dev,
- struct msix_entry *entries)
+static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries)
{
struct msi_desc *entry;
- int i = 0;
- void __iomem *desc_addr;
for_each_pci_msi_entry(entry, dev) {
- if (entries)
- entries[i++].vector = entry->irq;
+ if (entries) {
+ entries->vector = entry->irq;
+ entries++;
+ }
+ }
+}
- desc_addr = pci_msix_desc_addr(entry);
- if (desc_addr)
- entry->masked = readl(desc_addr +
- PCI_MSIX_ENTRY_VECTOR_CTRL);
- else
- entry->masked = 0;
+static void msix_mask_all(void __iomem *base, int tsize)
+{
+ u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
+ int i;
- msix_mask_irq(entry, 1);
- }
+ for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
+ writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL);
}
/**
@@ -768,9 +773,9 @@ static void msix_program_entries(struct pci_dev *dev,
static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
int nvec, struct irq_affinity *affd)
{
- int ret;
- u16 control;
void __iomem *base;
+ int ret, tsize;
+ u16 control;
/*
* Some devices require MSI-X to be enabled before the MSI-X
@@ -782,12 +787,16 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
/* Request & Map MSI-X table region */
- base = msix_map_region(dev, msix_table_size(control));
+ tsize = msix_table_size(control);
+ base = msix_map_region(dev, tsize);
if (!base) {
ret = -ENOMEM;
goto out_disable;
}
+ /* Ensure that all table entries are masked. */
+ msix_mask_all(base, tsize);
+
ret = msix_setup_entries(dev, base, entries, nvec, affd);
if (ret)
goto out_disable;
@@ -801,7 +810,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
if (ret)
goto out_free;
- msix_program_entries(dev, entries);
+ msix_update_entries(dev, entries);
ret = populate_msi_sysfs(dev);
if (ret)