July 2020 - Linux-stable-mirror

[PATCH] drm/i915: Also drop vm.ref along error paths for vma construction

by Chris Wilson

Not only do we need to release the vm.ref we acquired for the vma on the duplicate insert branch, but also for the normal error paths, so roll them all into one. Reported-by: Andi Shyti <andi.shyti(a)intel.com> Suggested-by: Andi Shyti <andi.shyti(a)intel.com> Fixes: 2850748ef876 ("drm/i915: Pull i915_vma_pin under the vm->mutex") Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk> Cc: Andi Shyti <andi.shyti(a)intel.com> Cc: <stable(a)vger.kernel.org> # v5.5+ --- drivers/gpu/drm/i915/i915_vma.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 7fe1f317cd2b..f4e22e256ac6 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -104,6 +104,7 @@ vma_create(struct drm_i915_gem_object *obj, struct i915_address_space *vm, const struct i915_ggtt_view *view) { + struct i915_vma *pos = ERR_PTR(-E2BIG); struct i915_vma *vma; struct rb_node *rb, **p; @@ -184,7 +185,6 @@ vma_create(struct drm_i915_gem_object *obj, rb = NULL; p = &obj->vma.tree.rb_node; while (*p) { - struct i915_vma *pos; long cmp; rb = *p; @@ -196,17 +196,12 @@ vma_create(struct drm_i915_gem_object *obj, * and dispose of ours. */ cmp = i915_vma_compare(pos, vm, view); - if (cmp == 0) { - spin_unlock(&obj->vma.lock); - i915_vm_put(vm); - i915_vma_free(vma); - return pos; - } - if (cmp < 0) p = &rb->rb_right; - else + else if (cmp > 0) p = &rb->rb_left; + else + goto err_unlock; } rb_link_node(&vma->obj_node, rb, p); rb_insert_color(&vma->obj_node, &obj->vma.tree); @@ -229,8 +224,9 @@ vma_create(struct drm_i915_gem_object *obj, err_unlock: spin_unlock(&obj->vma.lock); err_vma: + i915_vm_put(vm); i915_vma_free(vma); - return ERR_PTR(-E2BIG); + return pos; } static struct i915_vma * -- 2.20.1

5 years, 5 months

2
1
0 0

[PATCH 01/23] drm/i915: Drop vm.ref for duplicate vma on construction

by Chris Wilson

As we allow for parallel threads to create vma instances in parallel, and we only filter out the duplicates upon reacquiring the spinlock for the rbtree, we have to free the loser of the constructors' race. When freeing, we should also drop any resource references acquired for the redundant vma. Fixes: 2850748ef876 ("drm/i915: Pull i915_vma_pin under the vm->mutex") Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin(a)intel.com> Cc: <stable(a)vger.kernel.org> # v5.5+ --- drivers/gpu/drm/i915/i915_vma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 1f63c4a1f055..7fe1f317cd2b 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -198,6 +198,7 @@ vma_create(struct drm_i915_gem_object *obj, cmp = i915_vma_compare(pos, vm, view); if (cmp == 0) { spin_unlock(&obj->vma.lock); + i915_vm_put(vm); i915_vma_free(vma); return pos; } -- 2.20.1

5 years, 5 months

3
4
0 0

[PATCH] tpm: Define TPM2_SPACE_BUFFER_SIZE to replace the use of PAGE_SIZE

by Jarkko Sakkinen

The size of the buffers for storing context's and sessions can vary from arch to arch as PAGE_SIZE can be anything between 4 kB and 256 kB (the maximum for PPC64). Define a fixed buffer size set to 16 kB. This should be enough for most use with three handles (that is how many we allow at the moment). Reported-by: Stefan Berger <stefanb(a)linux.ibm.com> Cc: stable(a)vger.kernel.org Fixes: 745b361e989a ("tpm: infrastructure for TPM spaces") Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com> --- drivers/char/tpm/tpm2-space.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/char/tpm/tpm2-space.c b/drivers/char/tpm/tpm2-space.c index 982d341d8837..9bef646093d1 100644 --- a/drivers/char/tpm/tpm2-space.c +++ b/drivers/char/tpm/tpm2-space.c @@ -15,6 +15,8 @@ #include <asm/unaligned.h> #include "tpm.h" +#define TPM2_SPACE_BUFFER_SIZE 16384 /* 16 kB */ + enum tpm2_handle_types { TPM2_HT_HMAC_SESSION = 0x02000000, TPM2_HT_POLICY_SESSION = 0x03000000, @@ -40,11 +42,11 @@ static void tpm2_flush_sessions(struct tpm_chip *chip, struct tpm_space *space) int tpm2_init_space(struct tpm_space *space) { - space->context_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + space->context_buf = kzalloc(TPM2_SPACE_BUFFER_SIZE, GFP_KERNEL); if (!space->context_buf) return -ENOMEM; - space->session_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + space->session_buf = kzalloc(TPM2_SPACE_BUFFER_SIZE, GFP_KERNEL); if (space->session_buf == NULL) { kfree(space->context_buf); return -ENOMEM; @@ -311,8 +313,10 @@ int tpm2_prepare_space(struct tpm_chip *chip, struct tpm_space *space, u8 *cmd, sizeof(space->context_tbl)); memcpy(&chip->work_space.session_tbl, &space->session_tbl, sizeof(space->session_tbl)); - memcpy(chip->work_space.context_buf, space->context_buf, PAGE_SIZE); - memcpy(chip->work_space.session_buf, space->session_buf, PAGE_SIZE); + memcpy(chip->work_space.context_buf, space->context_buf, + TPM2_SPACE_BUFFER_SIZE); + memcpy(chip->work_space.session_buf, space->session_buf, + TPM2_SPACE_BUFFER_SIZE); rc = tpm2_load_space(chip); if (rc) { @@ -492,8 +496,8 @@ static int tpm2_save_space(struct tpm_chip *chip) continue; rc = tpm2_save_context(chip, space->context_tbl[i], - space->context_buf, PAGE_SIZE, - &offset); + space->context_buf, + TPM2_SPACE_BUFFER_SIZE, &offset); if (rc == -ENOENT) { space->context_tbl[i] = 0; continue; @@ -509,9 +513,8 @@ static int tpm2_save_space(struct tpm_chip *chip) continue; rc = tpm2_save_context(chip, space->session_tbl[i], - space->session_buf, PAGE_SIZE, - &offset); - + space->session_buf, + TPM2_SPACE_BUFFER_SIZE, &offset); if (rc == -ENOENT) { /* handle error saving session, just forget it */ space->session_tbl[i] = 0; @@ -557,8 +560,10 @@ int tpm2_commit_space(struct tpm_chip *chip, struct tpm_space *space, sizeof(space->context_tbl)); memcpy(&space->session_tbl, &chip->work_space.session_tbl, sizeof(space->session_tbl)); - memcpy(space->context_buf, chip->work_space.context_buf, PAGE_SIZE); - memcpy(space->session_buf, chip->work_space.session_buf, PAGE_SIZE); + memcpy(space->context_buf, chip->work_space.context_buf, + TPM2_SPACE_BUFFER_SIZE); + memcpy(space->session_buf, chip->work_space.session_buf, + TPM2_SPACE_BUFFER_SIZE); return 0; out: -- 2.25.1

5 years, 5 months

2
2
0 0

FAILED: patch "[PATCH] drm/amd/display: Fix ineffective setting of max bpc property" failed to apply to 5.7-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.7-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ >From fa7041d9d2fc7401cece43f305eb5b87b7017fc4 Mon Sep 17 00:00:00 2001 From: Stylon Wang <stylon.wang(a)amd.com> Date: Fri, 12 Jun 2020 19:04:18 +0800 Subject: [PATCH] drm/amd/display: Fix ineffective setting of max bpc property [Why] Regression was introduced where setting max bpc property has no effect on the atomic check and final commit. It has the same effect as max bpc being stuck at 8. [How] Correctly propagate max bpc with the new connector state. Signed-off-by: Stylon Wang <stylon.wang(a)amd.com> Reviewed-by: Nicholas Kazlauskas <Nicholas.Kazlauskas(a)amd.com> Acked-by: Rodrigo Siqueira <Rodrigo.Siqueira(a)amd.com> Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com> Cc: stable(a)vger.kernel.org diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7ced9f87be97..10ac8076d4f2 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -5024,7 +5024,8 @@ create_validate_stream_for_sink(struct amdgpu_dm_connector *aconnector, struct drm_connector *connector = &aconnector->base; struct amdgpu_device *adev = connector->dev->dev_private; struct dc_stream_state *stream; - int requested_bpc = connector->state ? connector->state->max_requested_bpc : 8; + const struct drm_connector_state *drm_state = dm_state ? &dm_state->base : NULL; + int requested_bpc = drm_state ? drm_state->max_requested_bpc : 8; enum dc_status dc_result = DC_OK; do {

5 years, 5 months

2
1
0 0

FAILED: patch "[PATCH] mm: fix swap cache node allocation mask" failed to apply to 5.7-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.7-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ >From 243bce09c91b0145aeaedd5afba799d81841c030 Mon Sep 17 00:00:00 2001 From: Hugh Dickins <hughd(a)google.com> Date: Thu, 25 Jun 2020 20:29:59 -0700 Subject: [PATCH] mm: fix swap cache node allocation mask Chris Murphy reports that a slightly overcommitted load, testing swap and zram along with i915, splats and keeps on splatting, when it had better fail less noisily: gnome-shell: page allocation failure: order:0, mode:0x400d0(__GFP_IO|__GFP_FS|__GFP_COMP|__GFP_RECLAIMABLE), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 2 PID: 1155 Comm: gnome-shell Not tainted 5.7.0-1.fc33.x86_64 #1 Call Trace: dump_stack+0x64/0x88 warn_alloc.cold+0x75/0xd9 __alloc_pages_slowpath.constprop.0+0xcfa/0xd30 __alloc_pages_nodemask+0x2df/0x320 alloc_slab_page+0x195/0x310 allocate_slab+0x3c5/0x440 ___slab_alloc+0x40c/0x5f0 __slab_alloc+0x1c/0x30 kmem_cache_alloc+0x20e/0x220 xas_nomem+0x28/0x70 add_to_swap_cache+0x321/0x400 __read_swap_cache_async+0x105/0x240 swap_cluster_readahead+0x22c/0x2e0 shmem_swapin+0x8e/0xc0 shmem_swapin_page+0x196/0x740 shmem_getpage_gfp+0x3a2/0xa60 shmem_read_mapping_page_gfp+0x32/0x60 shmem_get_pages+0x155/0x5e0 [i915] __i915_gem_object_get_pages+0x68/0xa0 [i915] i915_vma_pin+0x3fe/0x6c0 [i915] eb_add_vma+0x10b/0x2c0 [i915] i915_gem_do_execbuffer+0x704/0x3430 [i915] i915_gem_execbuffer2_ioctl+0x1ea/0x3e0 [i915] drm_ioctl_kernel+0x86/0xd0 [drm] drm_ioctl+0x206/0x390 [drm] ksys_ioctl+0x82/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported on 5.7, but it goes back really to 3.1: when shmem_read_mapping_page_gfp() was implemented for use by i915, and allowed for __GFP_NORETRY and __GFP_NOWARN flags in most places, but missed swapin's "& GFP_KERNEL" mask for page tree node allocation in __read_swap_cache_async() - that was to mask off HIGHUSER_MOVABLE bits from what page cache uses, but GFP_RECLAIM_MASK is now what's needed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=208085 Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2006151330070.11064@eggly.anvils Fixes: 68da9f055755 ("tmpfs: pass gfp to shmem_getpage_gfp") Signed-off-by: Hugh Dickins <hughd(a)google.com> Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz> Reviewed-by: Matthew Wilcox (Oracle) <willy(a)infradead.org> Reported-by: Chris Murphy <lists(a)colorremedies.com> Analyzed-by: Vlastimil Babka <vbabka(a)suse.cz> Analyzed-by: Matthew Wilcox <willy(a)infradead.org> Tested-by: Chris Murphy <lists(a)colorremedies.com> Cc: <stable(a)vger.kernel.org> [3.1+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org> diff --git a/mm/swap_state.c b/mm/swap_state.c index e98ff460e9e9..05889e8e3c97 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,7 +21,7 @@ #include <linux/vmalloc.h> #include <linux/swap_slots.h> #include <linux/huge_mm.h> - +#include "internal.h" /* * swapper_space is a fiction, retained to simplify the path through @@ -429,7 +429,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __SetPageSwapBacked(page); /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) { + if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK)) { put_swap_page(page, entry); goto fail_unlock; }

5 years, 5 months

3
2
0 0

FAILED: patch "[PATCH] btrfs: fix data block group relocation failure due to" failed to apply to 4.14-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 4.14-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ >From 432cd2a10f1c10cead91fe706ff5dc52f06d642a Mon Sep 17 00:00:00 2001 From: Filipe Manana <fdmanana(a)suse.com> Date: Mon, 8 Jun 2020 13:32:55 +0100 Subject: [PATCH] btrfs: fix data block group relocation failure due to concurrent scrub When running relocation of a data block group while scrub is running in parallel, it is possible that the relocation will fail and abort the current transaction with an -EINVAL error: [134243.988595] BTRFS info (device sdc): found 14 extents, stage: move data extents [134243.999871] ------------[ cut here ]------------ [134244.000741] BTRFS: Transaction aborted (error -22) [134244.001692] WARNING: CPU: 0 PID: 26954 at fs/btrfs/ctree.c:1071 __btrfs_cow_block+0x6a7/0x790 [btrfs] [134244.003380] Modules linked in: btrfs blake2b_generic xor raid6_pq (...) [134244.012577] CPU: 0 PID: 26954 Comm: btrfs Tainted: G W 5.6.0-rc7-btrfs-next-58 #5 [134244.014162] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [134244.016184] RIP: 0010:__btrfs_cow_block+0x6a7/0x790 [btrfs] [134244.017151] Code: 48 c7 c7 (...) [134244.020549] RSP: 0018:ffffa41607863888 EFLAGS: 00010286 [134244.021515] RAX: 0000000000000000 RBX: ffff9614bdfe09c8 RCX: 0000000000000000 [134244.022822] RDX: 0000000000000001 RSI: ffffffffb3d63980 RDI: 0000000000000001 [134244.024124] RBP: ffff961589e8c000 R08: 0000000000000000 R09: 0000000000000001 [134244.025424] R10: ffffffffc0ae5955 R11: 0000000000000000 R12: ffff9614bd530d08 [134244.026725] R13: ffff9614ced41b88 R14: ffff9614bdfe2a48 R15: 0000000000000000 [134244.028024] FS: 00007f29b63c08c0(0000) GS:ffff9615ba600000(0000) knlGS:0000000000000000 [134244.029491] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [134244.030560] CR2: 00007f4eb339b000 CR3: 0000000130d6e006 CR4: 00000000003606f0 [134244.031997] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [134244.033153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [134244.034484] Call Trace: [134244.034984] btrfs_cow_block+0x12b/0x2b0 [btrfs] [134244.035859] do_relocation+0x30b/0x790 [btrfs] [134244.036681] ? do_raw_spin_unlock+0x49/0xc0 [134244.037460] ? _raw_spin_unlock+0x29/0x40 [134244.038235] relocate_tree_blocks+0x37b/0x730 [btrfs] [134244.039245] relocate_block_group+0x388/0x770 [btrfs] [134244.040228] btrfs_relocate_block_group+0x161/0x2e0 [btrfs] [134244.041323] btrfs_relocate_chunk+0x36/0x110 [btrfs] [134244.041345] btrfs_balance+0xc06/0x1860 [btrfs] [134244.043382] ? btrfs_ioctl_balance+0x27c/0x310 [btrfs] [134244.045586] btrfs_ioctl_balance+0x1ed/0x310 [btrfs] [134244.045611] btrfs_ioctl+0x1880/0x3760 [btrfs] [134244.049043] ? do_raw_spin_unlock+0x49/0xc0 [134244.049838] ? _raw_spin_unlock+0x29/0x40 [134244.050587] ? __handle_mm_fault+0x11b3/0x14b0 [134244.051417] ? ksys_ioctl+0x92/0xb0 [134244.052070] ksys_ioctl+0x92/0xb0 [134244.052701] ? trace_hardirqs_off_thunk+0x1a/0x1c [134244.053511] __x64_sys_ioctl+0x16/0x20 [134244.054206] do_syscall_64+0x5c/0x280 [134244.054891] entry_SYSCALL_64_after_hwframe+0x49/0xbe [134244.055819] RIP: 0033:0x7f29b51c9dd7 [134244.056491] Code: 00 00 00 (...) [134244.059767] RSP: 002b:00007ffcccc1dd08 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [134244.061168] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f29b51c9dd7 [134244.062474] RDX: 00007ffcccc1dda0 RSI: 00000000c4009420 RDI: 0000000000000003 [134244.063771] RBP: 0000000000000003 R08: 00005565cea4b000 R09: 0000000000000000 [134244.065032] R10: 0000000000000541 R11: 0000000000000202 R12: 00007ffcccc2060a [134244.066327] R13: 00007ffcccc1dda0 R14: 0000000000000002 R15: 00007ffcccc1dec0 [134244.067626] irq event stamp: 0 [134244.068202] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [134244.069351] hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020 [134244.070909] softirqs last enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020 [134244.072392] softirqs last disabled at (0): [<0000000000000000>] 0x0 [134244.073432] ---[ end trace bd7c03622e0b0a99 ]--- The -EINVAL error comes from the following chain of function calls: __btrfs_cow_block() <-- aborts the transaction btrfs_reloc_cow_block() replace_file_extents() get_new_location() <-- returns -EINVAL When relocating a data block group, for each allocated extent of the block group, we preallocate another extent (at prealloc_file_extent_cluster()), associated with the data relocation inode, and then dirty all its pages. These preallocated extents have, and must have, the same size that extents from the data block group being relocated have. Later before we start the relocation stage that updates pointers (bytenr field of file extent items) to point to the the new extents, we trigger writeback for the data relocation inode. The expectation is that writeback will write the pages to the previously preallocated extents, that it follows the NOCOW path. That is generally the case, however, if a scrub is running it may have turned the block group that contains those extents into RO mode, in which case writeback falls back to the COW path. However in the COW path instead of allocating exactly one extent with the expected size, the allocator may end up allocating several smaller extents due to free space fragmentation - because we tell it at cow_file_range() that the minimum allocation size can match the filesystem's sector size. This later breaks the relocation's expectation that an extent associated to a file extent item in the data relocation inode has the same size as the respective extent pointed by a file extent item in another tree - in this case the extent to which the relocation inode poins to is smaller, causing relocation.c:get_new_location() to return -EINVAL. For example, if we are relocating a data block group X that has a logical address of X and the block group has an extent allocated at the logical address X + 128KiB with a size of 64KiB: 1) At prealloc_file_extent_cluster() we allocate an extent for the data relocation inode with a size of 64KiB and associate it to the file offset 128KiB (X + 128KiB - X) of the data relocation inode. This preallocated extent was allocated at block group Z; 2) A scrub running in parallel turns block group Z into RO mode and starts scrubing its extents; 3) Relocation triggers writeback for the data relocation inode; 4) When running delalloc (btrfs_run_delalloc_range()), we try first the NOCOW path because the data relocation inode has BTRFS_INODE_PREALLOC set in its flags. However, because block group Z is in RO mode, the NOCOW path (run_delalloc_nocow()) falls back into the COW path, by calling cow_file_range(); 5) At cow_file_range(), in the first iteration of the while loop we call btrfs_reserve_extent() to allocate a 64KiB extent and pass it a minimum allocation size of 4KiB (fs_info->sectorsize). Due to free space fragmentation, btrfs_reserve_extent() ends up allocating two extents of 32KiB each, each one on a different iteration of that while loop; 6) Writeback of the data relocation inode completes; 7) Relocation proceeds and ends up at relocation.c:replace_file_extents(), with a leaf which has a file extent item that points to the data extent from block group X, that has a logical address (bytenr) of X + 128KiB and a size of 64KiB. Then it calls get_new_location(), which does a lookup in the data relocation tree for a file extent item starting at offset 128KiB (X + 128KiB - X) and belonging to the data relocation inode. It finds a corresponding file extent item, however that item points to an extent that has a size of 32KiB, which doesn't match the expected size of 64KiB, resuling in -EINVAL being returned from this function and propagated up to __btrfs_cow_block(), which aborts the current transaction. To fix this make sure that at cow_file_range() when we call the allocator we pass it a minimum allocation size corresponding the desired extent size if the inode belongs to the data relocation tree, otherwise pass it the filesystem's sector size as the minimum allocation size. CC: stable(a)vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik <josef(a)toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana(a)suse.com> Signed-off-by: David Sterba <dsterba(a)suse.com> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 12b5d61f23bb..62c3f4972ff6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -985,6 +985,7 @@ static noinline int cow_file_range(struct inode *inode, u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; + u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; @@ -1035,10 +1036,26 @@ static noinline int cow_file_range(struct inode *inode, btrfs_drop_extent_cache(BTRFS_I(inode), start, start + num_bytes - 1, 0); + /* + * Relocation relies on the relocated extents to have exactly the same + * size as the original extents. Normally writeback for relocation data + * extents follows a NOCOW path because relocation preallocates the + * extents. However, due to an operation such as scrub turning a block + * group to RO mode, it may fallback to COW mode, so we must make sure + * an extent allocated during COW has exactly the requested size and can + * not be split into smaller extents, otherwise relocation breaks and + * fails during the stage where it updates the bytenr of file extent + * items. + */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + min_alloc_size = num_bytes; + else + min_alloc_size = fs_info->sectorsize; + while (num_bytes > 0) { cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, - fs_info->sectorsize, 0, alloc_hint, + min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock;

5 years, 5 months

2
1
0 0

FAILED: patch "[PATCH] btrfs: fix race between block group removal and block group" failed to apply to 5.7-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.7-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ >From ffcb9d44572afbaf8fa6dbf5115bff6dab7b299e Mon Sep 17 00:00:00 2001 From: Filipe Manana <fdmanana(a)suse.com> Date: Mon, 1 Jun 2020 19:12:19 +0100 Subject: [PATCH] btrfs: fix race between block group removal and block group creation There is a race between block group removal and block group creation when the removal is completed by a task running fitrim or scrub. When this happens we end up failing the block group creation with an error -EEXIST since we attempt to insert a duplicate block group item key in the extent tree. That results in a transaction abort. The race happens like this: 1) Task A is doing a fitrim, and at btrfs_trim_block_group() it freezes block group X with btrfs_freeze_block_group() (until very recently that was named btrfs_get_block_group_trimming()); 2) Task B starts removing block group X, either because it's now unused or due to relocation for example. So at btrfs_remove_block_group(), while holding the chunk mutex and the block group's lock, it sets the 'removed' flag of the block group and it sets the local variable 'remove_em' to false, because the block group is currently frozen (its 'frozen' counter is > 0, until very recently this counter was named 'trimming'); 3) Task B unlocks the block group and the chunk mutex; 4) Task A is done trimming the block group and unfreezes the block group by calling btrfs_unfreeze_block_group() (until very recently this was named btrfs_put_block_group_trimming()). In this function we lock the block group and set the local variable 'cleanup' to true because we were able to decrement the block group's 'frozen' counter down to 0 and the flag 'removed' is set in the block group. Since 'cleanup' is set to true, it locks the chunk mutex and removes the extent mapping representing the block group from the mapping tree; 5) Task C allocates a new block group Y and it picks up the logical address that block group X had as the logical address for Y, because X was the block group with the highest logical address and now the second block group with the highest logical address, the last in the fs mapping tree, ends at an offset corresponding to block group X's logical address (this logical address selection is done at volumes.c:find_next_chunk()). At this point the new block group Y does not have yet its item added to the extent tree (nor the corresponding device extent items and chunk item in the device and chunk trees). The new group Y is added to the list of pending block groups in the transaction handle; 6) Before task B proceeds to removing the block group item for block group X from the extent tree, which has a key matching: (X logical offset, BTRFS_BLOCK_GROUP_ITEM_KEY, length) task C while ending its transaction handle calls btrfs_create_pending_block_groups(), which finds block group Y and tries to insert the block group item for Y into the exten tree, which fails with -EEXIST since logical offset is the same that X had and task B hasn't yet deleted the key from the extent tree. This failure results in a transaction abort, producing a stack like the following: ------------[ cut here ]------------ BTRFS: Transaction aborted (error -17) WARNING: CPU: 2 PID: 19736 at fs/btrfs/block-group.c:2074 btrfs_create_pending_block_groups+0x1eb/0x260 [btrfs] Modules linked in: btrfs blake2b_generic xor raid6_pq (...) CPU: 2 PID: 19736 Comm: fsstress Tainted: G W 5.6.0-rc7-btrfs-next-58 #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_create_pending_block_groups+0x1eb/0x260 [btrfs] Code: ff ff ff 48 8b 55 50 f0 48 (...) RSP: 0018:ffffa4160a1c7d58 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff961581909d98 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffb3d63990 RDI: 0000000000000001 RBP: ffff9614f3356a58 R08: 0000000000000000 R09: 0000000000000001 R10: ffff9615b65b0040 R11: 0000000000000000 R12: ffff961581909c10 R13: ffff9615b0c32000 R14: ffff9614f3356ab0 R15: ffff9614be779000 FS: 00007f2ce2841e80(0000) GS:ffff9615bae00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000555f18780000 CR3: 0000000131d34005 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_start_dirty_block_groups+0x398/0x4e0 [btrfs] btrfs_commit_transaction+0xd0/0xc50 [btrfs] ? btrfs_attach_transaction_barrier+0x1e/0x50 [btrfs] ? __ia32_sys_fdatasync+0x20/0x20 iterate_supers+0xdb/0x180 ksys_sync+0x60/0xb0 __ia32_sys_sync+0xa/0x10 do_syscall_64+0x5c/0x280 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f2ce1d4d5b7 Code: 83 c4 08 48 3d 01 (...) RSP: 002b:00007ffd8b558c58 EFLAGS: 00000202 ORIG_RAX: 00000000000000a2 RAX: ffffffffffffffda RBX: 000000000000002c RCX: 00007f2ce1d4d5b7 RDX: 00000000ffffffff RSI: 00000000186ba07b RDI: 000000000000002c RBP: 0000555f17b9e520 R08: 0000000000000012 R09: 000000000000ce00 R10: 0000000000000078 R11: 0000000000000202 R12: 0000000000000032 R13: 0000000051eb851f R14: 00007ffd8b558cd0 R15: 0000555f1798ec20 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020 softirqs last enabled at (0): [<ffffffffb2abdedf>] copy_process+0x74f/0x2020 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace bd7c03622e0b0a9c ]--- Fix this simply by making btrfs_remove_block_group() remove the block group's item from the extent tree before it flags the block group as removed. Also make the free space deletion from the free space tree before flagging the block group as removed, to avoid a similar race with adding and removing free space entries for the free space tree. Fixes: 04216820fe83d5 ("Btrfs: fix race between fs trimming and block group remove/allocation") CC: stable(a)vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana <fdmanana(a)suse.com> Signed-off-by: David Sterba <dsterba(a)suse.com> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6462dd0b155c..c037ef514b64 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1092,6 +1092,25 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->space_info->lock); + /* + * Remove the free space for the block group from the free space tree + * and the block group's item from the extent tree before marking the + * block group as removed. This is to prevent races with tasks that + * freeze and unfreeze a block group, this task and another task + * allocating a new block group - the unfreeze task ends up removing + * the block group's extent map before the task calling this function + * deletes the block group item from the extent tree, allowing for + * another task to attempt to create another block group with the same + * item key (and failing with -EEXIST and a transaction abort). + */ + ret = remove_block_group_free_space(trans, block_group); + if (ret) + goto out; + + ret = remove_block_group_item(trans, path, block_group); + if (ret < 0) + goto out; + mutex_lock(&fs_info->chunk_mutex); spin_lock(&block_group->lock); block_group->removed = 1; @@ -1126,14 +1145,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->chunk_mutex); - ret = remove_block_group_free_space(trans, block_group); - if (ret) - goto out; - - ret = remove_block_group_item(trans, path, block_group); - if (ret < 0) - goto out; - if (remove_em) { struct extent_map_tree *em_tree;

5 years, 5 months

2
1
0 0

[PATCH 3.16 00/10] Fix possible crash on L2CAP socket shutdown

by Denis Grigorev

This series of commits fixes a problem with closing l2cap connection if socket has unACKed frames. Due an to an infinite loop in l2cap_wait_ack the userspace process gets stuck in close() and then the kernel crashes with the following report: Call trace: [<ffffffc000ace0b4>] l2cap_do_send+0x2c/0xec [<ffffffc000acf5f8>] l2cap_send_sframe+0x178/0x260 [<ffffffc000acf740>] l2cap_send_rr_or_rnr+0x60/0x84 [<ffffffc000acf980>] l2cap_ack_timeout+0x60/0xac [<ffffffc0000b35b8>] process_one_work+0x140/0x384 [<ffffffc0000b393c>] worker_thread+0x140/0x4e4 [<ffffffc0000b8c48>] kthread+0xdc/0xf0 All kernels below v4.3 are affected. ------------------------- Commit log: Alexey Dobriyan (1): Bluetooth: Stop sabotaging list poisoning Dean Jenkins (8): Bluetooth: L2CAP ERTM shutdown protect sk and chan Bluetooth: Make __l2cap_wait_ack more efficient Bluetooth: Add BT_DBG to l2cap_sock_shutdown() Bluetooth: __l2cap_wait_ack() use msecs_to_jiffies() Bluetooth: __l2cap_wait_ack() add defensive timeout Bluetooth: Unwind l2cap_sock_shutdown() Bluetooth: Reorganize mutex lock in l2cap_sock_shutdown() Bluetooth: l2cap_disconnection_req priority over shutdown Tedd Ho-Jeong An (1): Bluetooth: Reinitialize the list after deletion for session user list include/net/bluetooth/l2cap.h | 2 + net/bluetooth/l2cap_core.c | 12 ++--- net/bluetooth/l2cap_sock.c | 94 +++++++++++++++++++++++++++-------- 3 files changed, 78 insertions(+), 30 deletions(-) -- 2.17.1

5 years, 5 months

3
14
0 0

FAILED: patch "[PATCH] btrfs: fix a block group ref counter leak after failure to" failed to apply to 4.19-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 4.19-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ >From 9fecd13202f520f3f25d5b1c313adb740fe19773 Mon Sep 17 00:00:00 2001 From: Filipe Manana <fdmanana(a)suse.com> Date: Mon, 1 Jun 2020 19:12:06 +0100 Subject: [PATCH] btrfs: fix a block group ref counter leak after failure to remove block group When removing a block group, if we fail to delete the block group's item from the extent tree, we jump to the 'out' label and end up decrementing the block group's reference count once only (by 1), resulting in a counter leak because the block group at that point was already removed from the block group cache rbtree - so we have to decrement the reference count twice, once for the rbtree and once for our lookup at the start of the function. There is a second bug where if removing the free space tree entries (the call to remove_block_group_free_space()) fails we end up jumping to the 'out_put_group' label but end up decrementing the reference count only once, when we should have done it twice, since we have already removed the block group from the block group cache rbtree. This happens because the reference count decrement for the rbtree reference happens after attempting to remove the free space tree entries, which is far away from the place where we remove the block group from the rbtree. To make things less error prone, decrement the reference count for the rbtree immediately after removing the block group from it. This also eleminates the need for two different exit labels on error, renaming 'out_put_label' to just 'out' and removing the old 'out'. Fixes: f6033c5e333238 ("btrfs: fix block group leak when removing fails") CC: stable(a)vger.kernel.org # 4.4+ Reviewed-by: Nikolay Borisov <nborisov(a)suse.com> Reviewed-by: Anand Jain <anand.jain(a)oracle.com> Signed-off-by: Filipe Manana <fdmanana(a)suse.com> Reviewed-by: David Sterba <dsterba(a)suse.com> Signed-off-by: David Sterba <dsterba(a)suse.com> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 176e8a292fd1..6462dd0b155c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -940,7 +940,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto out_put_group; + goto out; } /* @@ -978,7 +978,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(inode); - goto out_put_group; + goto out; } clear_nlink(inode); /* One for the block groups ref */ @@ -1001,13 +1001,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) - goto out_put_group; + goto out; if (ret > 0) btrfs_release_path(path); if (ret == 0) { ret = btrfs_del_item(trans, tree_root, path); if (ret) - goto out_put_group; + goto out; btrfs_release_path(path); } @@ -1016,6 +1016,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); + /* Once for the block groups rbtree */ + btrfs_put_block_group(block_group); + if (fs_info->first_logical_byte == block_group->start) fs_info->first_logical_byte = (u64)-1; spin_unlock(&fs_info->block_group_cache_lock); @@ -1125,10 +1128,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = remove_block_group_free_space(trans, block_group); if (ret) - goto out_put_group; - - /* Once for the block groups rbtree */ - btrfs_put_block_group(block_group); + goto out; ret = remove_block_group_item(trans, path, block_group); if (ret < 0) @@ -1145,10 +1145,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, free_extent_map(em); } -out_put_group: +out: /* Once for the lookup reference */ btrfs_put_block_group(block_group); -out: if (remove_rsv) btrfs_delayed_refs_rsv_release(fs_info, 1); btrfs_free_path(path);

5 years, 5 months

2
1
0 0

[PATCH 1/3] mm/vmscan: restore zone_reclaim_mode ABI

by Dave Hansen

From: Dave Hansen <dave.hansen(a)linux.intel.com> I went to go add a new RECLAIM_* mode for the zone_reclaim_mode sysctl. Like a good kernel developer, I also went to go update the documentation. I noticed that the bits in the documentation didn't match the bits in the #defines. The VM never explicitly checks the RECLAIM_ZONE bit. The bit is, however implicitly checked when checking 'node_reclaim_mode==0'. The RECLAIM_ZONE #define was removed in a cleanup. That, by itself is fine. But, when the bit was removed (bit 0) the _other_ bit locations also got changed. That's not OK because the bit values are documented to mean one specific thing and users surely rely on them meaning that one thing and not changing from kernel to kernel. The end result is that if someone had a script that did: sysctl vm.zone_reclaim_mode=1 That script went from doing nothing to writing out pages during node reclaim after the commit in question. That's not great. Put the bits back the way they were and add a comment so something like this is a bit harder to do again. Update the documentation to make it clear that the first bit is ignored. Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com> Fixes: 648b5cf368e0 ("mm/vmscan: remove unused RECLAIM_OFF/RECLAIM_ZONE") Cc: Ben Widawsky <ben.widawsky(a)intel.com> Cc: Alex Shi <alex.shi(a)linux.alibaba.com> Cc: Daniel Wagner <dwagner(a)suse.de> Cc: "Tobin C. Harding" <tobin(a)kernel.org> Cc: Christoph Lameter <cl(a)linux.com> Cc: Andrew Morton <akpm(a)linux-foundation.org> Cc: Huang Ying <ying.huang(a)intel.com> Cc: Dan Williams <dan.j.williams(a)intel.com> Cc: Qian Cai <cai(a)lca.pw> Cc: Daniel Wagner <dwagner(a)suse.de> Cc: stable(a)vger.kernel.org --- b/Documentation/admin-guide/sysctl/vm.rst | 10 +++++----- b/mm/vmscan.c | 9 +++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff -puN Documentation/admin-guide/sysctl/vm.rst~mm-vmscan-restore-old-zone_reclaim_mode-abi Documentation/admin-guide/sysctl/vm.rst --- a/Documentation/admin-guide/sysctl/vm.rst~mm-vmscan-restore-old-zone_reclaim_mode-abi 2020-07-01 08:22:11.354955336 -0700 +++ b/Documentation/admin-guide/sysctl/vm.rst 2020-07-01 08:22:11.360955336 -0700 @@ -948,11 +948,11 @@ that benefit from having their data cach left disabled as the caching effect is likely to be more important than data locality. -zone_reclaim may be enabled if it's known that the workload is partitioned -such that each partition fits within a NUMA node and that accessing remote -memory would cause a measurable performance reduction. The page allocator -will then reclaim easily reusable pages (those page cache pages that are -currently not used) before allocating off node pages. +Consider enabling one or more zone_reclaim mode bits if it's known that the +workload is partitioned such that each partition fits within a NUMA node +and that accessing remote memory would cause a measurable performance +reduction. The page allocator will take additional actions before +allocating off node pages. Allowing zone reclaim to write out pages stops processes that are writing large amounts of data from dirtying pages on other nodes. Zone diff -puN mm/vmscan.c~mm-vmscan-restore-old-zone_reclaim_mode-abi mm/vmscan.c --- a/mm/vmscan.c~mm-vmscan-restore-old-zone_reclaim_mode-abi 2020-07-01 08:22:11.356955336 -0700 +++ b/mm/vmscan.c 2020-07-01 08:22:11.362955336 -0700 @@ -4090,8 +4090,13 @@ module_init(kswapd_init) */ int node_reclaim_mode __read_mostly; -#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */ -#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */ +/* + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl + * ABI. New bits are OK, but existing bits can never change. + */ +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* * Priority for NODE_RECLAIM. This determines the fraction of pages _

5 years, 5 months

4
3
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror July 2020