Limit the number of files in io_uring fixed tables by RLIMIT_NOFILE,
that's the first and the simpliest restriction that we should impose.
Cc: stable(a)vger.kernel.org
Suggested-by: Jens Axboe <axboe(a)kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
---
fs/io_uring.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 30edc329d803..e6301d5d03a8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7730,6 +7730,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
+ if (nr_args > rlimit(RLIMIT_NOFILE))
+ return -EMFILE;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
--
2.32.0
When a mapped level interrupt (a timer, for example) is deactivated
by the guest, the corresponding host interrupt is equally deactivated.
However, the fate of the pending state still needs to be dealt
with in SW.
This is specially true when the interrupt was in the active+pending
state in the virtual distributor at the point where the guest
was entered. On exit, the pending state is potentially stale
(the guest may have put the interrupt in a non-pending state).
If we don't do anything, the interrupt will be spuriously injected
in the guest. Although this shouldn't have any ill effect (spurious
interrupts are always possible), we can improve the emulation by
detecting the deactivation-while-pending case and resample the
interrupt.
While we're at it, move the logic into a common helper that can
be shared between the two GIC implementations.
Fixes: e40cc57bac79 ("KVM: arm/arm64: vgic: Support level-triggered mapped interrupts")
Reported-by: Raghavendra Rao Ananta <rananta(a)google.com>
Tested-by: Raghavendra Rao Ananta <rananta(a)google.com>
Reviewed-by: Oliver Upton <oupton(a)google.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
Notes:
* From v1:
- Moved the resampling into a separate helper, and pointed both
GIC implementations to it.
- Collected TB, RB, with thanks.
arch/arm64/kvm/vgic/vgic-v2.c | 36 +++++----------------------------
arch/arm64/kvm/vgic/vgic-v3.c | 36 +++++----------------------------
arch/arm64/kvm/vgic/vgic.c | 38 +++++++++++++++++++++++++++++++++++
arch/arm64/kvm/vgic/vgic.h | 2 ++
4 files changed, 50 insertions(+), 62 deletions(-)
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 2c580204f1dc..95a18cec14a3 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
u32 val = cpuif->vgic_lr[lr];
u32 cpuid, intid = val & GICH_LR_VIRTUALID;
struct vgic_irq *irq;
+ bool deactivated;
/* Extract the source vCPU id from the LR */
cpuid = val & GICH_LR_PHYSID_CPUID;
@@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
if (irq->active && vgic_irq_is_sgi(intid))
@@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & GICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 66004f61cd83..21a6207fb2ee 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
u32 intid, cpuid;
struct vgic_irq *irq;
bool is_v2_sgi = false;
+ bool deactivated;
cpuid = val & GICH_LR_PHYSID_CPUID;
cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
@@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
irq->active = !!(val & ICH_LR_ACTIVE_BIT);
if (irq->active && is_v2_sgi)
@@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & ICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 111bff47e471..42a6ac78fe95 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -1022,3 +1022,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
return map_is_active;
}
+
+/*
+ * Level-triggered mapped IRQs are special because we only observe rising
+ * edges as input to the VGIC.
+ *
+ * If the guest never acked the interrupt we have to sample the physical
+ * line and set the line level, because the device state could have changed
+ * or we simply need to process the still pending interrupt later.
+ *
+ * We could also have entered the guest with the interrupt active+pending.
+ * On the next exit, we need to re-evaluate the pending state, as it could
+ * otherwise result in a spurious interrupt by injecting a now potentially
+ * stale pending state.
+ *
+ * If this causes us to lower the level, we have to also clear the physical
+ * active state, since we will otherwise never be told when the interrupt
+ * becomes asserted again.
+ *
+ * Another case is when the interrupt requires a helping hand on
+ * deactivation (no HW deactivation, for example).
+ */
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending)
+{
+ if (vgic_irq_is_mapped_level(irq)) {
+ bool resample = false;
+
+ if (unlikely(vgic_irq_needs_resampling(irq))) {
+ resample = !(irq->active || irq->pending_latch);
+ } else if (lr_pending || (lr_deactivated && irq->line_level)) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+ resample = !irq->line_level;
+ }
+
+ if (resample)
+ vgic_irq_set_phys_active(irq, false);
+ }
+}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index dc1f3d1657ee..14a9218641f5 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
unsigned long flags);
void vgic_kick_vcpus(struct kvm *kvm);
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending);
int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
phys_addr_t addr, phys_addr_t alignment);
--
2.30.2
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Subject: hugetlb: don't pass page cache pages to restore_reserve_on_error
syzbot hit kernel BUG at fs/hugetlbfs/inode.c:532 as described in [1].
This BUG triggers if the HPageRestoreReserve flag is set on a page in
the page cache. It should never be set, as the routine
huge_add_to_page_cache explicitly clears the flag after adding a page
to the cache.
The only code other than huge page allocation which sets the flag is
restore_reserve_on_error. It will potentially set the flag in rare out
of memory conditions. syzbot was injecting errors to cause memory
allocation errors which exercised this specific path.
The code in restore_reserve_on_error is doing the right thing. However,
there are instances where pages in the page cache were being passed to
restore_reserve_on_error. This is incorrect, as once a page goes into
the cache reservation information will not be modified for the page until
it is removed from the cache. Error paths do not remove pages from the
cache, so even in the case of error, the page will remain in the cache
and no reservation adjustment is needed.
Modify routines that potentially call restore_reserve_on_error with a
page cache page to no longer do so.
Note on fixes tag:
Prior to commit 846be08578ed ("mm/hugetlb: expand restore_reserve_on_error
functionality") the routine would not process page cache pages because
the HPageRestoreReserve flag is not set on such pages. Therefore, this
issue could not be trigggered. The code added by commit 846be08578ed
("mm/hugetlb: expand restore_reserve_on_error functionality") is needed
and correct. It exposed incorrect calls to restore_reserve_on_error which
is the root cause addressed by this commit.
[1] https://lore.kernel.org/linux-mm/00000000000050776d05c9b7c7f0@google.com/
Link: https://lkml.kernel.org/r/20210818213304.37038-1-mike.kravetz@oracle.com
Fixes: 846be08578ed ("mm/hugetlb: expand restore_reserve_on_error functionality")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reported-by: <syzbot+67654e51e54455f1c585(a)syzkaller.appspotmail.com>
Cc: Mina Almasry <almasrymina(a)google.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Naoya Horiguchi <naoya.horiguchi(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
--- a/mm/hugetlb.c~hugetlb-dont-pass-page-cache-pages-to-restore_reserve_on_error
+++ a/mm/hugetlb.c
@@ -2476,7 +2476,7 @@ void restore_reserve_on_error(struct hst
if (!rc) {
/*
* This indicates there is an entry in the reserve map
- * added by alloc_huge_page. We know it was added
+ * not added by alloc_huge_page. We know it was added
* before the alloc_huge_page call, otherwise
* HPageRestoreReserve would be set on the page.
* Remove the entry so that a subsequent allocation
@@ -4660,7 +4660,9 @@ retry_avoidcopy:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
- restore_reserve_on_error(h, vma, haddr, new_page);
+ /* No restore in case of successful pagetable update (Break COW) */
+ if (new_page != old_page)
+ restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
@@ -4776,7 +4778,7 @@ static vm_fault_t hugetlb_no_page(struct
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
- bool new_page = false;
+ bool new_page, new_pagecache_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -4799,6 +4801,7 @@ static vm_fault_t hugetlb_no_page(struct
goto out;
retry:
+ new_page = false;
page = find_lock_page(mapping, idx);
if (!page) {
/* Check for page in userfault range */
@@ -4842,6 +4845,7 @@ retry:
goto retry;
goto out;
}
+ new_pagecache_page = true;
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
@@ -4926,7 +4930,9 @@ backout:
spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
- restore_reserve_on_error(h, vma, haddr, page);
+ /* restore reserve for newly allocated pages not in page cache */
+ if (new_page && !new_pagecache_page)
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
goto out;
}
@@ -5135,6 +5141,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_s
int ret = -ENOMEM;
struct page *page;
int writable;
+ bool new_pagecache_page = false;
if (is_continue) {
ret = -EFAULT;
@@ -5228,6 +5235,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_s
ret = huge_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
+ new_pagecache_page = true;
}
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
@@ -5291,7 +5299,8 @@ out_release_unlock:
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
- restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ if (!new_pagecache_page)
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
_