The patch below does not apply to the 4.20-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d1051d6ebf8ef3517a5a3cf82bba8436d190f1c2 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Wed, 21 Nov 2018 17:10:52 +0200
Subject: [PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
Running btrfs/124 in a loop hung up on me sporadically with the
following call trace:
btrfs D 0 5760 5324 0x00000000
Call Trace:
? __schedule+0x243/0x800
schedule+0x33/0x90
btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs]
? wait_woken+0xa0/0xa0
btrfs_wait_ordered_range+0xbb/0x100 [btrfs]
btrfs_relocate_block_group+0x1ff/0x230 [btrfs]
btrfs_relocate_chunk+0x49/0x100 [btrfs]
btrfs_balance+0xbeb/0x1740 [btrfs]
btrfs_ioctl_balance+0x2ee/0x380 [btrfs]
btrfs_ioctl+0x1691/0x3110 [btrfs]
? lockdep_hardirqs_on+0xed/0x180
? __handle_mm_fault+0x8e7/0xfb0
? _raw_spin_unlock+0x24/0x30
? __handle_mm_fault+0x8e7/0xfb0
? do_vfs_ioctl+0xa5/0x6e0
? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
do_vfs_ioctl+0xa5/0x6e0
? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
ksys_ioctl+0x3a/0x70
__x64_sys_ioctl+0x16/0x20
do_syscall_64+0x60/0x1b0
entry_SYSCALL_64_after_hwframe+0x49/0xbe
This happens because during page writeback it's valid for
writepage_delalloc to instantiate a delalloc range which doesn't belong
to the page currently being written back.
The reason this case is valid is due to find_lock_delalloc_range
returning any available range after the passed delalloc_start and
ignoring whether the page under writeback is within that range.
In turn ordered extents (OE) are always created for the returned range
from find_lock_delalloc_range. If, however, a failure occurs while OE
are being created then the clean up code in btrfs_cleanup_ordered_extents
will be called.
Unfortunately the code in btrfs_cleanup_ordered_extents doesn't consider
the case of such 'foreign' range being processed and instead it always
assumes that the range OE are created for belongs to the page. This
leads to the first page of such a foreign range not being cleaned up since
it's deliberately missed and skipped by the current cleaning up code.
Fix this by correctly checking whether the current page belongs to the
range being instantiated and if so adjust the range parameters passed
for cleaning up. If it doesn't, then just clean the whole OE range
directly.
Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang")
CC: stable(a)vger.kernel.org # 4.14+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e7b97c699acf..e1451a69432b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -110,17 +110,17 @@ static void __endio_write_update_ordered(struct inode *inode,
* extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
* and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
* to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of
- * btrfs_run_delalloc_range already does proper cleanup for the first page of
- * the range, that is, it invokes the callback writepage_end_io_hook() for the
- * range of the first page.
+ * extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
- const u64 offset,
- const u64 bytes)
+ struct page *locked_page,
+ u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+ u64 page_start = page_offset(locked_page);
+ u64 page_end = page_start + PAGE_SIZE - 1;
+
struct page *page;
while (index <= end_index) {
@@ -131,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
ClearPagePrivate2(page);
put_page(page);
}
- return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
- bytes - PAGE_SIZE, false);
+
+ /*
+ * In case this page belongs to the delalloc range being instantiated
+ * then skip it, since the first page of a range is going to be
+ * properly cleaned up by the caller of run_delalloc_range
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ offset += PAGE_SIZE;
+ bytes -= PAGE_SIZE;
+ }
+
+ return __endio_write_update_ordered(inode, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
@@ -1603,7 +1613,8 @@ int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
write_flags);
}
if (ret)
- btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
+ btrfs_cleanup_ordered_extents(inode, locked_page, start,
+ end - start + 1);
return ret;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a5fb11429167ee6ddeeacc554efaf5776b36433a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Mon, 26 Nov 2018 20:07:17 +0000
Subject: [PATCH] Btrfs: fix deadlock with memory reclaim during scrub
When a transaction commit starts, it attempts to pause scrub and it blocks
until the scrub is paused. So while the transaction is blocked waiting for
scrub to pause, we can not do memory allocation with GFP_KERNEL from scrub,
otherwise we risk getting into a deadlock with reclaim.
Checking for scrub pause requests is done early at the beginning of the
while loop of scrub_stripe() and later in the loop, scrub_extent() and
scrub_raid56_parity() are called, which in turn call scrub_pages() and
scrub_pages_for_parity() respectively. These last two functions do memory
allocations using GFP_KERNEL. Same problem could happen while scrubbing
the super blocks, since it calls scrub_pages().
We also can not have any of the worker tasks, created by the scrub task,
doing GFP_KERNEL allocations, because before pausing, the scrub task waits
for all the worker tasks to complete (also done at scrub_stripe()).
So make sure GFP_NOFS is used for the memory allocations because at any
time a scrub pause request can happen from another task that started to
commit a transaction.
Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission path")
CC: stable(a)vger.kernel.org # 4.6+
Reviewed-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 902819d3cf41..bbd1b36f4918 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -322,6 +322,7 @@ static struct full_stripe_lock *insert_full_stripe_lock(
struct rb_node *parent = NULL;
struct full_stripe_lock *entry;
struct full_stripe_lock *ret;
+ unsigned int nofs_flag;
lockdep_assert_held(&locks_root->lock);
@@ -339,8 +340,17 @@ static struct full_stripe_lock *insert_full_stripe_lock(
}
}
- /* Insert new lock */
+ /*
+ * Insert new lock.
+ *
+ * We must use GFP_NOFS because the scrub task might be waiting for a
+ * worker task executing this function and in turn a transaction commit
+ * might be waiting the scrub task to pause (which needs to wait for all
+ * the worker tasks to complete before pausing).
+ */
+ nofs_flag = memalloc_nofs_save();
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!ret)
return ERR_PTR(-ENOMEM);
ret->logical = fstripe_logical;
@@ -1620,8 +1630,19 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
mutex_lock(&sctx->wr_lock);
again:
if (!sctx->wr_curr_bio) {
+ unsigned int nofs_flag;
+
+ /*
+ * We must use GFP_NOFS because the scrub task might be waiting
+ * for a worker task executing this function and in turn a
+ * transaction commit might be waiting the scrub task to pause
+ * (which needs to wait for all the worker tasks to complete
+ * before pausing).
+ */
+ nofs_flag = memalloc_nofs_save();
sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!sctx->wr_curr_bio) {
mutex_unlock(&sctx->wr_lock);
return -ENOMEM;
@@ -3772,6 +3793,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
+ unsigned int nofs_flag;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -3875,6 +3897,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
atomic_inc(&fs_info->scrubs_running);
mutex_unlock(&fs_info->scrub_lock);
+ /*
+ * In order to avoid deadlock with reclaim when there is a transaction
+ * trying to pause scrub, make sure we use GFP_NOFS for all the
+ * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+ * invoked by our callees. The pausing request is done when the
+ * transaction commit starts, and it blocks the transaction until scrub
+ * is paused (done at specific points at scrub_stripe() or right above
+ * before incrementing fs_info->scrubs_running).
+ */
+ nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
/*
* by holding device list mutex, we can
@@ -3887,6 +3919,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!ret)
ret = scrub_enumerate_chunks(sctx, dev, start, end);
+ memalloc_nofs_restore(nofs_flag);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a5fb11429167ee6ddeeacc554efaf5776b36433a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Mon, 26 Nov 2018 20:07:17 +0000
Subject: [PATCH] Btrfs: fix deadlock with memory reclaim during scrub
When a transaction commit starts, it attempts to pause scrub and it blocks
until the scrub is paused. So while the transaction is blocked waiting for
scrub to pause, we can not do memory allocation with GFP_KERNEL from scrub,
otherwise we risk getting into a deadlock with reclaim.
Checking for scrub pause requests is done early at the beginning of the
while loop of scrub_stripe() and later in the loop, scrub_extent() and
scrub_raid56_parity() are called, which in turn call scrub_pages() and
scrub_pages_for_parity() respectively. These last two functions do memory
allocations using GFP_KERNEL. Same problem could happen while scrubbing
the super blocks, since it calls scrub_pages().
We also can not have any of the worker tasks, created by the scrub task,
doing GFP_KERNEL allocations, because before pausing, the scrub task waits
for all the worker tasks to complete (also done at scrub_stripe()).
So make sure GFP_NOFS is used for the memory allocations because at any
time a scrub pause request can happen from another task that started to
commit a transaction.
Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission path")
CC: stable(a)vger.kernel.org # 4.6+
Reviewed-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 902819d3cf41..bbd1b36f4918 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -322,6 +322,7 @@ static struct full_stripe_lock *insert_full_stripe_lock(
struct rb_node *parent = NULL;
struct full_stripe_lock *entry;
struct full_stripe_lock *ret;
+ unsigned int nofs_flag;
lockdep_assert_held(&locks_root->lock);
@@ -339,8 +340,17 @@ static struct full_stripe_lock *insert_full_stripe_lock(
}
}
- /* Insert new lock */
+ /*
+ * Insert new lock.
+ *
+ * We must use GFP_NOFS because the scrub task might be waiting for a
+ * worker task executing this function and in turn a transaction commit
+ * might be waiting the scrub task to pause (which needs to wait for all
+ * the worker tasks to complete before pausing).
+ */
+ nofs_flag = memalloc_nofs_save();
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!ret)
return ERR_PTR(-ENOMEM);
ret->logical = fstripe_logical;
@@ -1620,8 +1630,19 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
mutex_lock(&sctx->wr_lock);
again:
if (!sctx->wr_curr_bio) {
+ unsigned int nofs_flag;
+
+ /*
+ * We must use GFP_NOFS because the scrub task might be waiting
+ * for a worker task executing this function and in turn a
+ * transaction commit might be waiting the scrub task to pause
+ * (which needs to wait for all the worker tasks to complete
+ * before pausing).
+ */
+ nofs_flag = memalloc_nofs_save();
sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!sctx->wr_curr_bio) {
mutex_unlock(&sctx->wr_lock);
return -ENOMEM;
@@ -3772,6 +3793,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
+ unsigned int nofs_flag;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -3875,6 +3897,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
atomic_inc(&fs_info->scrubs_running);
mutex_unlock(&fs_info->scrub_lock);
+ /*
+ * In order to avoid deadlock with reclaim when there is a transaction
+ * trying to pause scrub, make sure we use GFP_NOFS for all the
+ * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+ * invoked by our callees. The pausing request is done when the
+ * transaction commit starts, and it blocks the transaction until scrub
+ * is paused (done at specific points at scrub_stripe() or right above
+ * before incrementing fs_info->scrubs_running).
+ */
+ nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
/*
* by holding device list mutex, we can
@@ -3887,6 +3919,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!ret)
ret = scrub_enumerate_chunks(sctx, dev, start, end);
+ memalloc_nofs_restore(nofs_flag);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0d228ece59a35a9b9e8ff0d40653234a6d90f61e Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain(a)oracle.com>
Date: Sun, 11 Nov 2018 22:22:17 +0800
Subject: [PATCH] btrfs: dev-replace: go back to suspended state if target
device is missing
At the time of a forced unmount we place the running replace in the
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state. When the system comes
back up, the target device may turn out to be missing.
In that case, let the replace state stay in
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED instead of moving to
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED, as there isn't any matching scrub
running as part of the replace.
Fixes: e93c89c1aaaa ("Btrfs: add new sources for device replace code")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Anand Jain <anand.jain(a)oracle.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 32da6901dc88..11df8f778b63 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -890,6 +890,8 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
"cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
"you may cancel the operation after 'mount -o degraded'");
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
btrfs_dev_replace_write_unlock(dev_replace);
return 0;
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0d228ece59a35a9b9e8ff0d40653234a6d90f61e Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain(a)oracle.com>
Date: Sun, 11 Nov 2018 22:22:17 +0800
Subject: [PATCH] btrfs: dev-replace: go back to suspended state if target
device is missing
At the time of a forced unmount we place the running replace in the
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state. When the system comes
back up, the target device may turn out to be missing.
In that case, let the replace state stay in
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED instead of moving to
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED, as there isn't any matching scrub
running as part of the replace.
Fixes: e93c89c1aaaa ("Btrfs: add new sources for device replace code")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Anand Jain <anand.jain(a)oracle.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 32da6901dc88..11df8f778b63 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -890,6 +890,8 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
"cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
"you may cancel the operation after 'mount -o degraded'");
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
btrfs_dev_replace_write_unlock(dev_replace);
return 0;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 65b6657672388b72822e0367f06d41c1e3ffb5bb Mon Sep 17 00:00:00 2001
From: Jernej Skrabec <jernej.skrabec(a)siol.net>
Date: Sun, 4 Nov 2018 19:26:40 +0100
Subject: [PATCH] clk: sunxi-ng: Use u64 for calculation of NM rate
Allwinner H6 SoC has multiplier N range between 1 and 254. Since parent
rate is 24MHz, intermediate result when calculating final rate easily
overflows 32 bit variable.
Because of that, introduce function for calculating clock rate which
uses 64 bit variable for intermediate result.
Fixes: 6174a1e24b0d ("clk: sunxi-ng: Add N-M-factor clock support")
Fixes: ee28648cb2b4 ("clk: sunxi-ng: Remove the use of rational computations")
CC: <stable(a)vger.kernel.org>
Signed-off-by: Jernej Skrabec <jernej.skrabec(a)siol.net>
Signed-off-by: Maxime Ripard <maxime.ripard(a)bootlin.com>
diff --git a/drivers/clk/sunxi-ng/ccu_nm.c b/drivers/clk/sunxi-ng/ccu_nm.c
index 6fe3c14f7b2d..424d8635b053 100644
--- a/drivers/clk/sunxi-ng/ccu_nm.c
+++ b/drivers/clk/sunxi-ng/ccu_nm.c
@@ -19,6 +19,17 @@ struct _ccu_nm {
unsigned long m, min_m, max_m;
};
+static unsigned long ccu_nm_calc_rate(unsigned long parent,
+ unsigned long n, unsigned long m)
+{
+ u64 rate = parent;
+
+ rate *= n;
+ do_div(rate, m);
+
+ return rate;
+}
+
static void ccu_nm_find_best(unsigned long parent, unsigned long rate,
struct _ccu_nm *nm)
{
@@ -28,7 +39,8 @@ static void ccu_nm_find_best(unsigned long parent, unsigned long rate,
for (_n = nm->min_n; _n <= nm->max_n; _n++) {
for (_m = nm->min_m; _m <= nm->max_m; _m++) {
- unsigned long tmp_rate = parent * _n / _m;
+ unsigned long tmp_rate = ccu_nm_calc_rate(parent,
+ _n, _m);
if (tmp_rate > rate)
continue;
@@ -100,7 +112,7 @@ static unsigned long ccu_nm_recalc_rate(struct clk_hw *hw,
if (ccu_sdm_helper_is_enabled(&nm->common, &nm->sdm))
rate = ccu_sdm_helper_read_rate(&nm->common, &nm->sdm, m, n);
else
- rate = parent_rate * n / m;
+ rate = ccu_nm_calc_rate(parent_rate, n, m);
if (nm->common.features & CCU_FEATURE_FIXED_POSTDIV)
rate /= nm->fixed_post_div;
@@ -149,7 +161,7 @@ static long ccu_nm_round_rate(struct clk_hw *hw, unsigned long rate,
_nm.max_m = nm->m.max ?: 1 << nm->m.width;
ccu_nm_find_best(*parent_rate, rate, &_nm);
- rate = *parent_rate * _nm.n / _nm.m;
+ rate = ccu_nm_calc_rate(*parent_rate, _nm.n, _nm.m);
if (nm->common.features & CCU_FEATURE_FIXED_POSTDIV)
rate /= nm->fixed_post_div;