The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 46b2f4590aab71d31088a265c86026b1e96c9de4 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Tue, 24 Jul 2018 11:54:04 +0100
Subject: [PATCH] Btrfs: fix send failure when root has deleted files still
open
The more common use case of send involves creating a RO snapshot and then
use it for a send operation. In this case it's not possible to have inodes
in the snapshot that have a link count of zero (inode with an orphan item)
since during snapshot creation we do the orphan cleanup. However, other
less common use cases for send can end up seeing inodes with a link count
of zero and in this case the send operation fails with a ENOENT error
because any attempt to generate a path for the inode, with the purpose
of creating it or updating it at the receiver, fails since there are no
inode reference items. One use case it to use a regular subvolume for
a send operation after turning it to RO mode or turning a RW snapshot
into RO mode and then using it for a send operation. In both cases, if a
file gets all its hard links deleted while there is an open file
descriptor before turning the subvolume/snapshot into RO mode, the send
operation will encounter an inode with a link count of zero and then
fail with errno ENOENT.
Example using a full send with a subvolume:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ btrfs subvolume create /mnt/sv1
$ touch /mnt/sv1/foo
$ touch /mnt/sv1/bar
# keep an open file descriptor on file bar
$ exec 73</mnt/sv1/bar
$ unlink /mnt/sv1/bar
# Turn the subvolume to RO mode and use it for a full send, while
# holding the open file descriptor.
$ btrfs property set /mnt/sv1 ro true
$ btrfs send -f /tmp/full.send /mnt/sv1
At subvol /mnt/sv1
ERROR: send ioctl failed with -2: No such file or directory
Example using an incremental send with snapshots:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ btrfs subvolume create /mnt/sv1
$ touch /mnt/sv1/foo
$ touch /mnt/sv1/bar
$ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap1
$ echo "hello world" >> /mnt/sv1/bar
$ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap2
# Turn the second snapshot to RW mode and delete file foo while
# holding an open file descriptor on it.
$ btrfs property set /mnt/snap2 ro false
$ exec 73</mnt/snap2/foo
$ unlink /mnt/snap2/foo
# Set the second snapshot back to RO mode and do an incremental send.
$ btrfs property set /mnt/snap2 ro true
$ btrfs send -f /tmp/inc.send -p /mnt/snap1 /mnt/snap2
At subvol /mnt/snap2
ERROR: send ioctl failed with -2: No such file or directory
So fix this by ignoring inodes with a link count of zero if we are either
doing a full send or if they do not exist in the parent snapshot (they
are new in the send snapshot), and unlink all paths found in the parent
snapshot when doing an incremental send (and ignoring all other inode
items, such as xattrs and extents).
A test case for fstests follows soon.
CC: stable(a)vger.kernel.org # 4.4+
Reported-by: Martin Wilck <martin.wilck(a)suse.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 42e04cd3cd95..551294a6c9e2 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -100,6 +100,7 @@ struct send_ctx {
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+ bool ignore_cur_inode;
u64 send_progress;
@@ -5796,6 +5797,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
int pending_move = 0;
int refs_processed = 0;
+ if (sctx->ignore_cur_inode)
+ return 0;
+
ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
&refs_processed);
if (ret < 0)
@@ -5914,6 +5918,93 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
return ret;
}
+struct parent_paths_ctx {
+ struct list_head *refs;
+ struct send_ctx *sctx;
+};
+
+static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
+ void *ctx)
+{
+ struct parent_paths_ctx *ppctx = ctx;
+
+ return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx,
+ ppctx->refs);
+}
+
+/*
+ * Issue unlink operations for all paths of the current inode found in the
+ * parent snapshot.
+ */
+static int btrfs_unlink_all_paths(struct send_ctx *sctx)
+{
+ LIST_HEAD(deleted_refs);
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct parent_paths_ctx ctx;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = sctx->cur_ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ ctx.refs = &deleted_refs;
+ ctx.sctx = sctx;
+
+ while (true) {
+ struct extent_buffer *eb = path->nodes[0];
+ int slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(sctx->parent_root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ if (key.objectid != sctx->cur_ino)
+ break;
+ if (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
+ record_parent_ref, &ctx);
+ if (ret < 0)
+ goto out;
+
+ path->slots[0]++;
+ }
+
+ while (!list_empty(&deleted_refs)) {
+ struct recorded_ref *ref;
+
+ ref = list_first_entry(&deleted_refs, struct recorded_ref, list);
+ ret = send_unlink(sctx, ref->full_path);
+ if (ret < 0)
+ goto out;
+ fs_path_free(ref->full_path);
+ list_del(&ref->list);
+ kfree(ref);
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ if (ret)
+ __free_recorded_refs(&deleted_refs);
+ return ret;
+}
+
static int changed_inode(struct send_ctx *sctx,
enum btrfs_compare_tree_result result)
{
@@ -5928,6 +6019,7 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_new_gen = 0;
sctx->cur_inode_last_extent = (u64)-1;
sctx->cur_inode_next_write_offset = 0;
+ sctx->ignore_cur_inode = false;
/*
* Set send_progress to current inode. This will tell all get_cur_xxx
@@ -5968,6 +6060,33 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_new_gen = 1;
}
+ /*
+ * Normally we do not find inodes with a link count of zero (orphans)
+ * because the most common case is to create a snapshot and use it
+ * for a send operation. However other less common use cases involve
+ * using a subvolume and send it after turning it to RO mode just
+ * after deleting all hard links of a file while holding an open
+ * file descriptor against it or turning a RO snapshot into RW mode,
+ * keep an open file descriptor against a file, delete it and then
+ * turn the snapshot back to RO mode before using it for a send
+ * operation. So if we find such cases, ignore the inode and all its
+ * items completely if it's a new inode, or if it's a changed inode
+ * make sure all its previous paths (from the parent snapshot) are all
+ * unlinked and all other the inode items are ignored.
+ */
+ if (result == BTRFS_COMPARE_TREE_NEW ||
+ result == BTRFS_COMPARE_TREE_CHANGED) {
+ u32 nlinks;
+
+ nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
+ if (nlinks == 0) {
+ sctx->ignore_cur_inode = true;
+ if (result == BTRFS_COMPARE_TREE_CHANGED)
+ ret = btrfs_unlink_all_paths(sctx);
+ goto out;
+ }
+ }
+
if (result == BTRFS_COMPARE_TREE_NEW) {
sctx->cur_inode_gen = left_gen;
sctx->cur_inode_new = 1;
@@ -6306,15 +6425,17 @@ static int changed_cb(struct btrfs_path *left_path,
key->objectid == BTRFS_FREE_SPACE_OBJECTID)
goto out;
- if (key->type == BTRFS_INODE_ITEM_KEY)
+ if (key->type == BTRFS_INODE_ITEM_KEY) {
ret = changed_inode(sctx, result);
- else if (key->type == BTRFS_INODE_REF_KEY ||
- key->type == BTRFS_INODE_EXTREF_KEY)
- ret = changed_ref(sctx, result);
- else if (key->type == BTRFS_XATTR_ITEM_KEY)
- ret = changed_xattr(sctx, result);
- else if (key->type == BTRFS_EXTENT_DATA_KEY)
- ret = changed_extent(sctx, result);
+ } else if (!sctx->ignore_cur_inode) {
+ if (key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = changed_ref(sctx, result);
+ else if (key->type == BTRFS_XATTR_ITEM_KEY)
+ ret = changed_xattr(sctx, result);
+ else if (key->type == BTRFS_EXTENT_DATA_KEY)
+ ret = changed_extent(sctx, result);
+ }
out:
return ret;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 46b2f4590aab71d31088a265c86026b1e96c9de4 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Tue, 24 Jul 2018 11:54:04 +0100
Subject: [PATCH] Btrfs: fix send failure when root has deleted files still
open
The more common use case of send involves creating a RO snapshot and then
use it for a send operation. In this case it's not possible to have inodes
in the snapshot that have a link count of zero (inode with an orphan item)
since during snapshot creation we do the orphan cleanup. However, other
less common use cases for send can end up seeing inodes with a link count
of zero and in this case the send operation fails with a ENOENT error
because any attempt to generate a path for the inode, with the purpose
of creating it or updating it at the receiver, fails since there are no
inode reference items. One use case it to use a regular subvolume for
a send operation after turning it to RO mode or turning a RW snapshot
into RO mode and then using it for a send operation. In both cases, if a
file gets all its hard links deleted while there is an open file
descriptor before turning the subvolume/snapshot into RO mode, the send
operation will encounter an inode with a link count of zero and then
fail with errno ENOENT.
Example using a full send with a subvolume:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ btrfs subvolume create /mnt/sv1
$ touch /mnt/sv1/foo
$ touch /mnt/sv1/bar
# keep an open file descriptor on file bar
$ exec 73</mnt/sv1/bar
$ unlink /mnt/sv1/bar
# Turn the subvolume to RO mode and use it for a full send, while
# holding the open file descriptor.
$ btrfs property set /mnt/sv1 ro true
$ btrfs send -f /tmp/full.send /mnt/sv1
At subvol /mnt/sv1
ERROR: send ioctl failed with -2: No such file or directory
Example using an incremental send with snapshots:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ btrfs subvolume create /mnt/sv1
$ touch /mnt/sv1/foo
$ touch /mnt/sv1/bar
$ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap1
$ echo "hello world" >> /mnt/sv1/bar
$ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap2
# Turn the second snapshot to RW mode and delete file foo while
# holding an open file descriptor on it.
$ btrfs property set /mnt/snap2 ro false
$ exec 73</mnt/snap2/foo
$ unlink /mnt/snap2/foo
# Set the second snapshot back to RO mode and do an incremental send.
$ btrfs property set /mnt/snap2 ro true
$ btrfs send -f /tmp/inc.send -p /mnt/snap1 /mnt/snap2
At subvol /mnt/snap2
ERROR: send ioctl failed with -2: No such file or directory
So fix this by ignoring inodes with a link count of zero if we are either
doing a full send or if they do not exist in the parent snapshot (they
are new in the send snapshot), and unlink all paths found in the parent
snapshot when doing an incremental send (and ignoring all other inode
items, such as xattrs and extents).
A test case for fstests follows soon.
CC: stable(a)vger.kernel.org # 4.4+
Reported-by: Martin Wilck <martin.wilck(a)suse.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 42e04cd3cd95..551294a6c9e2 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -100,6 +100,7 @@ struct send_ctx {
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+ bool ignore_cur_inode;
u64 send_progress;
@@ -5796,6 +5797,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
int pending_move = 0;
int refs_processed = 0;
+ if (sctx->ignore_cur_inode)
+ return 0;
+
ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
&refs_processed);
if (ret < 0)
@@ -5914,6 +5918,93 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
return ret;
}
+struct parent_paths_ctx {
+ struct list_head *refs;
+ struct send_ctx *sctx;
+};
+
+static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
+ void *ctx)
+{
+ struct parent_paths_ctx *ppctx = ctx;
+
+ return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx,
+ ppctx->refs);
+}
+
+/*
+ * Issue unlink operations for all paths of the current inode found in the
+ * parent snapshot.
+ */
+static int btrfs_unlink_all_paths(struct send_ctx *sctx)
+{
+ LIST_HEAD(deleted_refs);
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct parent_paths_ctx ctx;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = sctx->cur_ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ ctx.refs = &deleted_refs;
+ ctx.sctx = sctx;
+
+ while (true) {
+ struct extent_buffer *eb = path->nodes[0];
+ int slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(sctx->parent_root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ if (key.objectid != sctx->cur_ino)
+ break;
+ if (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
+ record_parent_ref, &ctx);
+ if (ret < 0)
+ goto out;
+
+ path->slots[0]++;
+ }
+
+ while (!list_empty(&deleted_refs)) {
+ struct recorded_ref *ref;
+
+ ref = list_first_entry(&deleted_refs, struct recorded_ref, list);
+ ret = send_unlink(sctx, ref->full_path);
+ if (ret < 0)
+ goto out;
+ fs_path_free(ref->full_path);
+ list_del(&ref->list);
+ kfree(ref);
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ if (ret)
+ __free_recorded_refs(&deleted_refs);
+ return ret;
+}
+
static int changed_inode(struct send_ctx *sctx,
enum btrfs_compare_tree_result result)
{
@@ -5928,6 +6019,7 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_new_gen = 0;
sctx->cur_inode_last_extent = (u64)-1;
sctx->cur_inode_next_write_offset = 0;
+ sctx->ignore_cur_inode = false;
/*
* Set send_progress to current inode. This will tell all get_cur_xxx
@@ -5968,6 +6060,33 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_new_gen = 1;
}
+ /*
+ * Normally we do not find inodes with a link count of zero (orphans)
+ * because the most common case is to create a snapshot and use it
+ * for a send operation. However other less common use cases involve
+ * using a subvolume and send it after turning it to RO mode just
+ * after deleting all hard links of a file while holding an open
+ * file descriptor against it or turning a RO snapshot into RW mode,
+ * keep an open file descriptor against a file, delete it and then
+ * turn the snapshot back to RO mode before using it for a send
+ * operation. So if we find such cases, ignore the inode and all its
+ * items completely if it's a new inode, or if it's a changed inode
+ * make sure all its previous paths (from the parent snapshot) are all
+ * unlinked and all other the inode items are ignored.
+ */
+ if (result == BTRFS_COMPARE_TREE_NEW ||
+ result == BTRFS_COMPARE_TREE_CHANGED) {
+ u32 nlinks;
+
+ nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
+ if (nlinks == 0) {
+ sctx->ignore_cur_inode = true;
+ if (result == BTRFS_COMPARE_TREE_CHANGED)
+ ret = btrfs_unlink_all_paths(sctx);
+ goto out;
+ }
+ }
+
if (result == BTRFS_COMPARE_TREE_NEW) {
sctx->cur_inode_gen = left_gen;
sctx->cur_inode_new = 1;
@@ -6306,15 +6425,17 @@ static int changed_cb(struct btrfs_path *left_path,
key->objectid == BTRFS_FREE_SPACE_OBJECTID)
goto out;
- if (key->type == BTRFS_INODE_ITEM_KEY)
+ if (key->type == BTRFS_INODE_ITEM_KEY) {
ret = changed_inode(sctx, result);
- else if (key->type == BTRFS_INODE_REF_KEY ||
- key->type == BTRFS_INODE_EXTREF_KEY)
- ret = changed_ref(sctx, result);
- else if (key->type == BTRFS_XATTR_ITEM_KEY)
- ret = changed_xattr(sctx, result);
- else if (key->type == BTRFS_EXTENT_DATA_KEY)
- ret = changed_extent(sctx, result);
+ } else if (!sctx->ignore_cur_inode) {
+ if (key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = changed_ref(sctx, result);
+ else if (key->type == BTRFS_XATTR_ITEM_KEY)
+ ret = changed_xattr(sctx, result);
+ else if (key->type == BTRFS_EXTENT_DATA_KEY)
+ ret = changed_extent(sctx, result);
+ }
out:
return ret;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 46b2f4590aab71d31088a265c86026b1e96c9de4 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Tue, 24 Jul 2018 11:54:04 +0100
Subject: [PATCH] Btrfs: fix send failure when root has deleted files still
open
The more common use case of send involves creating a RO snapshot and then
use it for a send operation. In this case it's not possible to have inodes
in the snapshot that have a link count of zero (inode with an orphan item)
since during snapshot creation we do the orphan cleanup. However, other
less common use cases for send can end up seeing inodes with a link count
of zero and in this case the send operation fails with a ENOENT error
because any attempt to generate a path for the inode, with the purpose
of creating it or updating it at the receiver, fails since there are no
inode reference items. One use case it to use a regular subvolume for
a send operation after turning it to RO mode or turning a RW snapshot
into RO mode and then using it for a send operation. In both cases, if a
file gets all its hard links deleted while there is an open file
descriptor before turning the subvolume/snapshot into RO mode, the send
operation will encounter an inode with a link count of zero and then
fail with errno ENOENT.
Example using a full send with a subvolume:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ btrfs subvolume create /mnt/sv1
$ touch /mnt/sv1/foo
$ touch /mnt/sv1/bar
# keep an open file descriptor on file bar
$ exec 73</mnt/sv1/bar
$ unlink /mnt/sv1/bar
# Turn the subvolume to RO mode and use it for a full send, while
# holding the open file descriptor.
$ btrfs property set /mnt/sv1 ro true
$ btrfs send -f /tmp/full.send /mnt/sv1
At subvol /mnt/sv1
ERROR: send ioctl failed with -2: No such file or directory
Example using an incremental send with snapshots:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ btrfs subvolume create /mnt/sv1
$ touch /mnt/sv1/foo
$ touch /mnt/sv1/bar
$ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap1
$ echo "hello world" >> /mnt/sv1/bar
$ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap2
# Turn the second snapshot to RW mode and delete file foo while
# holding an open file descriptor on it.
$ btrfs property set /mnt/snap2 ro false
$ exec 73</mnt/snap2/foo
$ unlink /mnt/snap2/foo
# Set the second snapshot back to RO mode and do an incremental send.
$ btrfs property set /mnt/snap2 ro true
$ btrfs send -f /tmp/inc.send -p /mnt/snap1 /mnt/snap2
At subvol /mnt/snap2
ERROR: send ioctl failed with -2: No such file or directory
So fix this by ignoring inodes with a link count of zero if we are either
doing a full send or if they do not exist in the parent snapshot (they
are new in the send snapshot), and unlink all paths found in the parent
snapshot when doing an incremental send (and ignoring all other inode
items, such as xattrs and extents).
A test case for fstests follows soon.
CC: stable(a)vger.kernel.org # 4.4+
Reported-by: Martin Wilck <martin.wilck(a)suse.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 42e04cd3cd95..551294a6c9e2 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -100,6 +100,7 @@ struct send_ctx {
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+ bool ignore_cur_inode;
u64 send_progress;
@@ -5796,6 +5797,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
int pending_move = 0;
int refs_processed = 0;
+ if (sctx->ignore_cur_inode)
+ return 0;
+
ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
&refs_processed);
if (ret < 0)
@@ -5914,6 +5918,93 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
return ret;
}
+struct parent_paths_ctx {
+ struct list_head *refs;
+ struct send_ctx *sctx;
+};
+
+static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
+ void *ctx)
+{
+ struct parent_paths_ctx *ppctx = ctx;
+
+ return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx,
+ ppctx->refs);
+}
+
+/*
+ * Issue unlink operations for all paths of the current inode found in the
+ * parent snapshot.
+ */
+static int btrfs_unlink_all_paths(struct send_ctx *sctx)
+{
+ LIST_HEAD(deleted_refs);
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct parent_paths_ctx ctx;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = sctx->cur_ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ ctx.refs = &deleted_refs;
+ ctx.sctx = sctx;
+
+ while (true) {
+ struct extent_buffer *eb = path->nodes[0];
+ int slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(sctx->parent_root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ if (key.objectid != sctx->cur_ino)
+ break;
+ if (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
+ record_parent_ref, &ctx);
+ if (ret < 0)
+ goto out;
+
+ path->slots[0]++;
+ }
+
+ while (!list_empty(&deleted_refs)) {
+ struct recorded_ref *ref;
+
+ ref = list_first_entry(&deleted_refs, struct recorded_ref, list);
+ ret = send_unlink(sctx, ref->full_path);
+ if (ret < 0)
+ goto out;
+ fs_path_free(ref->full_path);
+ list_del(&ref->list);
+ kfree(ref);
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ if (ret)
+ __free_recorded_refs(&deleted_refs);
+ return ret;
+}
+
static int changed_inode(struct send_ctx *sctx,
enum btrfs_compare_tree_result result)
{
@@ -5928,6 +6019,7 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_new_gen = 0;
sctx->cur_inode_last_extent = (u64)-1;
sctx->cur_inode_next_write_offset = 0;
+ sctx->ignore_cur_inode = false;
/*
* Set send_progress to current inode. This will tell all get_cur_xxx
@@ -5968,6 +6060,33 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_new_gen = 1;
}
+ /*
+ * Normally we do not find inodes with a link count of zero (orphans)
+ * because the most common case is to create a snapshot and use it
+ * for a send operation. However other less common use cases involve
+ * using a subvolume and send it after turning it to RO mode just
+ * after deleting all hard links of a file while holding an open
+ * file descriptor against it or turning a RO snapshot into RW mode,
+ * keep an open file descriptor against a file, delete it and then
+ * turn the snapshot back to RO mode before using it for a send
+ * operation. So if we find such cases, ignore the inode and all its
+ * items completely if it's a new inode, or if it's a changed inode
+ * make sure all its previous paths (from the parent snapshot) are all
+ * unlinked and all other the inode items are ignored.
+ */
+ if (result == BTRFS_COMPARE_TREE_NEW ||
+ result == BTRFS_COMPARE_TREE_CHANGED) {
+ u32 nlinks;
+
+ nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
+ if (nlinks == 0) {
+ sctx->ignore_cur_inode = true;
+ if (result == BTRFS_COMPARE_TREE_CHANGED)
+ ret = btrfs_unlink_all_paths(sctx);
+ goto out;
+ }
+ }
+
if (result == BTRFS_COMPARE_TREE_NEW) {
sctx->cur_inode_gen = left_gen;
sctx->cur_inode_new = 1;
@@ -6306,15 +6425,17 @@ static int changed_cb(struct btrfs_path *left_path,
key->objectid == BTRFS_FREE_SPACE_OBJECTID)
goto out;
- if (key->type == BTRFS_INODE_ITEM_KEY)
+ if (key->type == BTRFS_INODE_ITEM_KEY) {
ret = changed_inode(sctx, result);
- else if (key->type == BTRFS_INODE_REF_KEY ||
- key->type == BTRFS_INODE_EXTREF_KEY)
- ret = changed_ref(sctx, result);
- else if (key->type == BTRFS_XATTR_ITEM_KEY)
- ret = changed_xattr(sctx, result);
- else if (key->type == BTRFS_EXTENT_DATA_KEY)
- ret = changed_extent(sctx, result);
+ } else if (!sctx->ignore_cur_inode) {
+ if (key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = changed_ref(sctx, result);
+ else if (key->type == BTRFS_XATTR_ITEM_KEY)
+ ret = changed_xattr(sctx, result);
+ else if (key->type == BTRFS_EXTENT_DATA_KEY)
+ ret = changed_extent(sctx, result);
+ }
out:
return ret;
The patch below does not apply to the 3.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3c4276936f6fbe52884b4ea4e6cc120b890a0f9f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik(a)fb.com>
Date: Fri, 20 Jul 2018 11:46:10 -0700
Subject: [PATCH] Btrfs: fix btrfs_write_inode vs delayed iput deadlock
We recently ran into the following deadlock involving
btrfs_write_inode():
[ +0.005066] __schedule+0x38e/0x8c0
[ +0.007144] schedule+0x36/0x80
[ +0.006447] bit_wait+0x11/0x60
[ +0.006446] __wait_on_bit+0xbe/0x110
[ +0.007487] ? bit_wait_io+0x60/0x60
[ +0.007319] __inode_wait_for_writeback+0x96/0xc0
[ +0.009568] ? autoremove_wake_function+0x40/0x40
[ +0.009565] inode_wait_for_writeback+0x21/0x30
[ +0.009224] evict+0xb0/0x190
[ +0.006099] iput+0x1a8/0x210
[ +0.006103] btrfs_run_delayed_iputs+0x73/0xc0
[ +0.009047] btrfs_commit_transaction+0x799/0x8c0
[ +0.009567] btrfs_write_inode+0x81/0xb0
[ +0.008008] __writeback_single_inode+0x267/0x320
[ +0.009569] writeback_sb_inodes+0x25b/0x4e0
[ +0.008702] wb_writeback+0x102/0x2d0
[ +0.007487] wb_workfn+0xa4/0x310
[ +0.006794] ? wb_workfn+0xa4/0x310
[ +0.007143] process_one_work+0x150/0x410
[ +0.008179] worker_thread+0x6d/0x520
[ +0.007490] kthread+0x12c/0x160
[ +0.006620] ? put_pwq_unlocked+0x80/0x80
[ +0.008185] ? kthread_park+0xa0/0xa0
[ +0.007484] ? do_syscall_64+0x53/0x150
[ +0.007837] ret_from_fork+0x29/0x40
Writeback calls:
btrfs_write_inode
btrfs_commit_transaction
btrfs_run_delayed_iputs
If iput() is called on that same inode, evict() will wait for writeback
forever.
btrfs_write_inode() was originally added way back in 4730a4bc5bf3
("btrfs_dirty_inode") to support O_SYNC writes. However, ->write_inode()
hasn't been used for O_SYNC since 148f948ba877 ("vfs: Introduce new
helpers for syncing after writing to O_SYNC file or IS_SYNC inode"), so
btrfs_write_inode() is actually unnecessary (and leads to a bunch of
unnecessary commits). Get rid of it, which also gets rid of the
deadlock.
CC: stable(a)vger.kernel.org # 3.2+
Signed-off-by: Josef Bacik <jbacik(a)fb.com>
[Omar: new commit message]
Signed-off-by: Omar Sandoval <osandov(a)fb.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4955e04da4c8..472457795486 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6021,32 +6021,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
return ret;
}
-int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
- int ret = 0;
- bool nolock = false;
-
- if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
- return 0;
-
- if (btrfs_fs_closing(root->fs_info) &&
- btrfs_is_free_space_inode(BTRFS_I(inode)))
- nolock = true;
-
- if (wbc->sync_mode == WB_SYNC_ALL) {
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
- else
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans);
- }
- return ret;
-}
-
/*
* This is somewhat expensive, updating the tree every time the
* inode changes. But, it is most likely to find the inode in cache.
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index efe8b03ce380..67de3c0fc85b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2344,7 +2344,6 @@ static const struct super_operations btrfs_super_ops = {
.sync_fs = btrfs_sync_fs,
.show_options = btrfs_show_options,
.show_devname = btrfs_show_devname,
- .write_inode = btrfs_write_inode,
.alloc_inode = btrfs_alloc_inode,
.destroy_inode = btrfs_destroy_inode,
.statfs = btrfs_statfs,
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3c4276936f6fbe52884b4ea4e6cc120b890a0f9f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik(a)fb.com>
Date: Fri, 20 Jul 2018 11:46:10 -0700
Subject: [PATCH] Btrfs: fix btrfs_write_inode vs delayed iput deadlock
We recently ran into the following deadlock involving
btrfs_write_inode():
[ +0.005066] __schedule+0x38e/0x8c0
[ +0.007144] schedule+0x36/0x80
[ +0.006447] bit_wait+0x11/0x60
[ +0.006446] __wait_on_bit+0xbe/0x110
[ +0.007487] ? bit_wait_io+0x60/0x60
[ +0.007319] __inode_wait_for_writeback+0x96/0xc0
[ +0.009568] ? autoremove_wake_function+0x40/0x40
[ +0.009565] inode_wait_for_writeback+0x21/0x30
[ +0.009224] evict+0xb0/0x190
[ +0.006099] iput+0x1a8/0x210
[ +0.006103] btrfs_run_delayed_iputs+0x73/0xc0
[ +0.009047] btrfs_commit_transaction+0x799/0x8c0
[ +0.009567] btrfs_write_inode+0x81/0xb0
[ +0.008008] __writeback_single_inode+0x267/0x320
[ +0.009569] writeback_sb_inodes+0x25b/0x4e0
[ +0.008702] wb_writeback+0x102/0x2d0
[ +0.007487] wb_workfn+0xa4/0x310
[ +0.006794] ? wb_workfn+0xa4/0x310
[ +0.007143] process_one_work+0x150/0x410
[ +0.008179] worker_thread+0x6d/0x520
[ +0.007490] kthread+0x12c/0x160
[ +0.006620] ? put_pwq_unlocked+0x80/0x80
[ +0.008185] ? kthread_park+0xa0/0xa0
[ +0.007484] ? do_syscall_64+0x53/0x150
[ +0.007837] ret_from_fork+0x29/0x40
Writeback calls:
btrfs_write_inode
btrfs_commit_transaction
btrfs_run_delayed_iputs
If iput() is called on that same inode, evict() will wait for writeback
forever.
btrfs_write_inode() was originally added way back in 4730a4bc5bf3
("btrfs_dirty_inode") to support O_SYNC writes. However, ->write_inode()
hasn't been used for O_SYNC since 148f948ba877 ("vfs: Introduce new
helpers for syncing after writing to O_SYNC file or IS_SYNC inode"), so
btrfs_write_inode() is actually unnecessary (and leads to a bunch of
unnecessary commits). Get rid of it, which also gets rid of the
deadlock.
CC: stable(a)vger.kernel.org # 3.2+
Signed-off-by: Josef Bacik <jbacik(a)fb.com>
[Omar: new commit message]
Signed-off-by: Omar Sandoval <osandov(a)fb.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4955e04da4c8..472457795486 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6021,32 +6021,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
return ret;
}
-int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
- int ret = 0;
- bool nolock = false;
-
- if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
- return 0;
-
- if (btrfs_fs_closing(root->fs_info) &&
- btrfs_is_free_space_inode(BTRFS_I(inode)))
- nolock = true;
-
- if (wbc->sync_mode == WB_SYNC_ALL) {
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
- else
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans);
- }
- return ret;
-}
-
/*
* This is somewhat expensive, updating the tree every time the
* inode changes. But, it is most likely to find the inode in cache.
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index efe8b03ce380..67de3c0fc85b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2344,7 +2344,6 @@ static const struct super_operations btrfs_super_ops = {
.sync_fs = btrfs_sync_fs,
.show_options = btrfs_show_options,
.show_devname = btrfs_show_devname,
- .write_inode = btrfs_write_inode,
.alloc_inode = btrfs_alloc_inode,
.destroy_inode = btrfs_destroy_inode,
.statfs = btrfs_statfs,
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3c4276936f6fbe52884b4ea4e6cc120b890a0f9f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik(a)fb.com>
Date: Fri, 20 Jul 2018 11:46:10 -0700
Subject: [PATCH] Btrfs: fix btrfs_write_inode vs delayed iput deadlock
We recently ran into the following deadlock involving
btrfs_write_inode():
[ +0.005066] __schedule+0x38e/0x8c0
[ +0.007144] schedule+0x36/0x80
[ +0.006447] bit_wait+0x11/0x60
[ +0.006446] __wait_on_bit+0xbe/0x110
[ +0.007487] ? bit_wait_io+0x60/0x60
[ +0.007319] __inode_wait_for_writeback+0x96/0xc0
[ +0.009568] ? autoremove_wake_function+0x40/0x40
[ +0.009565] inode_wait_for_writeback+0x21/0x30
[ +0.009224] evict+0xb0/0x190
[ +0.006099] iput+0x1a8/0x210
[ +0.006103] btrfs_run_delayed_iputs+0x73/0xc0
[ +0.009047] btrfs_commit_transaction+0x799/0x8c0
[ +0.009567] btrfs_write_inode+0x81/0xb0
[ +0.008008] __writeback_single_inode+0x267/0x320
[ +0.009569] writeback_sb_inodes+0x25b/0x4e0
[ +0.008702] wb_writeback+0x102/0x2d0
[ +0.007487] wb_workfn+0xa4/0x310
[ +0.006794] ? wb_workfn+0xa4/0x310
[ +0.007143] process_one_work+0x150/0x410
[ +0.008179] worker_thread+0x6d/0x520
[ +0.007490] kthread+0x12c/0x160
[ +0.006620] ? put_pwq_unlocked+0x80/0x80
[ +0.008185] ? kthread_park+0xa0/0xa0
[ +0.007484] ? do_syscall_64+0x53/0x150
[ +0.007837] ret_from_fork+0x29/0x40
Writeback calls:
btrfs_write_inode
btrfs_commit_transaction
btrfs_run_delayed_iputs
If iput() is called on that same inode, evict() will wait for writeback
forever.
btrfs_write_inode() was originally added way back in 4730a4bc5bf3
("btrfs_dirty_inode") to support O_SYNC writes. However, ->write_inode()
hasn't been used for O_SYNC since 148f948ba877 ("vfs: Introduce new
helpers for syncing after writing to O_SYNC file or IS_SYNC inode"), so
btrfs_write_inode() is actually unnecessary (and leads to a bunch of
unnecessary commits). Get rid of it, which also gets rid of the
deadlock.
CC: stable(a)vger.kernel.org # 3.2+
Signed-off-by: Josef Bacik <jbacik(a)fb.com>
[Omar: new commit message]
Signed-off-by: Omar Sandoval <osandov(a)fb.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4955e04da4c8..472457795486 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6021,32 +6021,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
return ret;
}
-int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
- int ret = 0;
- bool nolock = false;
-
- if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
- return 0;
-
- if (btrfs_fs_closing(root->fs_info) &&
- btrfs_is_free_space_inode(BTRFS_I(inode)))
- nolock = true;
-
- if (wbc->sync_mode == WB_SYNC_ALL) {
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
- else
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans);
- }
- return ret;
-}
-
/*
* This is somewhat expensive, updating the tree every time the
* inode changes. But, it is most likely to find the inode in cache.
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index efe8b03ce380..67de3c0fc85b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2344,7 +2344,6 @@ static const struct super_operations btrfs_super_ops = {
.sync_fs = btrfs_sync_fs,
.show_options = btrfs_show_options,
.show_devname = btrfs_show_devname,
- .write_inode = btrfs_write_inode,
.alloc_inode = btrfs_alloc_inode,
.destroy_inode = btrfs_destroy_inode,
.statfs = btrfs_statfs,
Hi !
this is also the wrong version of the patch - the proper version is
below. This has been posted to lkml https://lkml.org/lkml/2018/7/18/191
"[PATCH V3] drm: handle error values properly" but there was no review yet
The version you have here though is for sure broken. So maybe this should
be simply dropped until the above, presumably correct fix, is confirmed.
thx!
hofrat
----- Forwarded message from gregkh(a)linuxfoundation.org -----
Date: Tue, 28 Aug 2018 16:09:59 +0200
From: gregkh(a)linuxfoundation.org
To: 1531571532-22733-1-git-send-email-hofrat(a)osadl.org, alexander.levin(a)microsoft.com, gregkh(a)linuxfoundation.org, hofrat(a)osadl.org,
seanpaul(a)chromium.org
Cc: stable-commits(a)vger.kernel.org
Subject: Patch "drm: re-enable error handling" has been added to the 3.18-stable tree
This is a note to let you know that I've just added the patch titled
drm: re-enable error handling
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
drm-re-enable-error-handling.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Tue Aug 28 16:08:28 CEST 2018
From: Nicholas Mc Guire <hofrat(a)osadl.org>
Date: Sat, 14 Jul 2018 14:32:12 +0200
Subject: drm: re-enable error handling
From: Nicholas Mc Guire <hofrat(a)osadl.org>
[ Upstream commit d530b5f1ca0bb66958a2b714bebe40a1248b9c15 ]
drm_legacy_ctxbitmap_next() returns idr_alloc() which can return
-ENOMEM, -EINVAL or -ENOSPC none of which are -1 . but the call sites
of drm_legacy_ctxbitmap_next() seem to be assuming that the error case
would be -1 (original return of drm_ctxbitmap_next() prior to 2.6.23
was actually -1). Thus reenable error handling by checking for < 0.
Signed-off-by: Nicholas Mc Guire <hofrat(a)osadl.org>
Fixes: 62968144e673 ("drm: convert drm context code to use Linux idr")
Signed-off-by: Sean Paul <seanpaul(a)chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/1531571532-22733-1-git-send-e…
Signed-off-by: Sasha Levin <alexander.levin(a)microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/gpu/drm/drm_context.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/gpu/drm/drm_context.c
+++ b/drivers/gpu/drm/drm_context.c
@@ -341,7 +341,7 @@ int drm_legacy_addctx(struct drm_device
ctx->handle = drm_legacy_ctxbitmap_next(dev);
}
DRM_DEBUG("%d\n", ctx->handle);
- if (ctx->handle == -1) {
+ if (ctx->handle < 0) {
DRM_DEBUG("Not enough free contexts.\n");
/* Should this return -EBUSY instead? */
return -ENOMEM;
Patches currently in stable-queue which might be from hofrat(a)osadl.org are
queue-3.18/drm-re-enable-error-handling.patch
queue-3.18/can-mpc5xxx_can-check-of_iomap-return-before-use.patch
----- End forwarded message -----
In NMI context, we might be in the middle of context switching or in
the middle of switch_mm_irqs_off(). In either case, CR3 might not
match current->mm, which could cause copy_from_user_nmi() and
friends to read the wrong memory.
Fix it by adding a new nmi_uaccess_okay() helper and checking it in
copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
Cc: stable(a)vger.kernel.org
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Signed-off-by: Andy Lutomirski <luto(a)kernel.org>
---
The 0day bot is still chewing on this, but I've tested it a bit locally
and it seems to do the right thing.
I've never observed the bug it fixes, but it does appear to fix a bug
unless I've missed something. It's also a prerequisite for Nadav's
fixmap bugfix.
arch/x86/events/core.c | 2 +-
arch/x86/include/asm/tlbflush.h | 16 ++++++++++++++++
arch/x86/lib/usercopy.c | 5 +++++
arch/x86/mm/tlb.c | 3 +++
4 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
perf_callchain_store(entry, regs->ip);
- if (!current->mm)
+ if (!nmi_uaccess_okay())
return;
if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 89a73bc31622..b23b2625793b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -230,6 +230,22 @@ struct tlb_state {
};
DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm. It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ struct mm_struct *current_mm = current->mm;
+
+ return current_mm && loaded_mm == current_mm &&
+ loaded_mm->pgd == __va(read_cr3_pa());
+}
+
/* Initialize cr4 shadow for this CPU. */
static inline void cr4_init_shadow(void)
{
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..3f435d7fca5e 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
#include <linux/uaccess.h>
#include <linux/export.h>
+#include <asm/tlbflush.h>
+
/*
* We rely on the nested NMI work to allow atomic faults from the NMI path; the
* nested NMI paths are careful to preserve CR2.
@@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
if (__range_not_ok(from, n, TASK_SIZE))
return n;
+ if (!nmi_uaccess_okay())
+ return n;
+
/*
* Even though this function is typically called from NMI/IRQ context
* disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 457b281b9339..f4b41d5a93dd 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -345,6 +345,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
*/
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
+ /* Let NMI code know that CR3 may not match expectations. */
+ this_cpu_write(cpu_tlbstate.loaded_mm, NULL);
+
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
--
2.17.1
The patch
regulator: bd71837: Disable voltage monitoring for LDO3/4
has been applied to the regulator tree at
https://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator.git
All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.
You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.
If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.
Please add any relevant lists and maintainers to the CCs when replying
to this mail.
Thanks,
Mark
>From 823f18f8b860526fc099c222619a126d57d2ad8c Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen(a)fi.rohmeurope.com>
Date: Wed, 29 Aug 2018 15:36:10 +0300
Subject: [PATCH] regulator: bd71837: Disable voltage monitoring for LDO3/4
There is a HW quirk in BD71837. The shutdown sequence timings for
bucks/LDOs which are enabled via register interface are changed.
At PMIC poweroff the voltage for BUCK6/7 is cut immediately at the
beginning of shut-down sequence. This causes LDO5/6 voltage
monitoring to detect under voltage and force PMIC to emergency
state instead of poweroff. Disable voltage monitoring for LDO5 and
LDO6 at probe to avoid this.
Signed-off-by: Matti Vaittinen <matti.vaittinen(a)fi.rohmeurope.com>
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
drivers/regulator/bd71837-regulator.c | 19 +++++++++++++++
include/linux/mfd/rohm-bd718x7.h | 33 ++++++++++++++++++++++++---
2 files changed, 49 insertions(+), 3 deletions(-)
diff --git a/drivers/regulator/bd71837-regulator.c b/drivers/regulator/bd71837-regulator.c
index 0f8ac8dec3e1..a1bd8aaf4d98 100644
--- a/drivers/regulator/bd71837-regulator.c
+++ b/drivers/regulator/bd71837-regulator.c
@@ -569,6 +569,25 @@ static int bd71837_probe(struct platform_device *pdev)
BD71837_REG_REGLOCK);
}
+ /*
+ * There is a HW quirk in BD71837. The shutdown sequence timings for
+ * bucks/LDOs which are controlled via register interface are changed.
+ * At PMIC poweroff the voltage for BUCK6/7 is cut immediately at the
+ * beginning of shut-down sequence. As bucks 6 and 7 are parent
+ * supplies for LDO5 and LDO6 - this causes LDO5/6 voltage
+ * monitoring to errorneously detect under voltage and force PMIC to
+ * emergency state instead of poweroff. In order to avoid this we
+ * disable voltage monitoring for LDO5 and LDO6
+ */
+ err = regmap_update_bits(pmic->mfd->regmap, BD718XX_REG_MVRFLTMASK2,
+ BD718XX_LDO5_VRMON80 | BD718XX_LDO6_VRMON80,
+ BD718XX_LDO5_VRMON80 | BD718XX_LDO6_VRMON80);
+ if (err) {
+ dev_err(&pmic->pdev->dev,
+ "Failed to disable voltage monitoring\n");
+ goto err;
+ }
+
for (i = 0; i < ARRAY_SIZE(pmic_regulator_inits); i++) {
struct regulator_desc *desc;
diff --git a/include/linux/mfd/rohm-bd718x7.h b/include/linux/mfd/rohm-bd718x7.h
index a528747f8aed..e8338e5dc10b 100644
--- a/include/linux/mfd/rohm-bd718x7.h
+++ b/include/linux/mfd/rohm-bd718x7.h
@@ -78,9 +78,9 @@ enum {
BD71837_REG_TRANS_COND0 = 0x1F,
BD71837_REG_TRANS_COND1 = 0x20,
BD71837_REG_VRFAULTEN = 0x21,
- BD71837_REG_MVRFLTMASK0 = 0x22,
- BD71837_REG_MVRFLTMASK1 = 0x23,
- BD71837_REG_MVRFLTMASK2 = 0x24,
+ BD718XX_REG_MVRFLTMASK0 = 0x22,
+ BD718XX_REG_MVRFLTMASK1 = 0x23,
+ BD718XX_REG_MVRFLTMASK2 = 0x24,
BD71837_REG_RCVCFG = 0x25,
BD71837_REG_RCVNUM = 0x26,
BD71837_REG_PWRONCONFIG0 = 0x27,
@@ -159,6 +159,33 @@ enum {
#define BUCK8_MASK 0x3F
#define BUCK8_DEFAULT 0x1E
+/* BD718XX Voltage monitoring masks */
+#define BD718XX_BUCK1_VRMON80 0x1
+#define BD718XX_BUCK1_VRMON130 0x2
+#define BD718XX_BUCK2_VRMON80 0x4
+#define BD718XX_BUCK2_VRMON130 0x8
+#define BD718XX_1ST_NODVS_BUCK_VRMON80 0x1
+#define BD718XX_1ST_NODVS_BUCK_VRMON130 0x2
+#define BD718XX_2ND_NODVS_BUCK_VRMON80 0x4
+#define BD718XX_2ND_NODVS_BUCK_VRMON130 0x8
+#define BD718XX_3RD_NODVS_BUCK_VRMON80 0x10
+#define BD718XX_3RD_NODVS_BUCK_VRMON130 0x20
+#define BD718XX_4TH_NODVS_BUCK_VRMON80 0x40
+#define BD718XX_4TH_NODVS_BUCK_VRMON130 0x80
+#define BD718XX_LDO1_VRMON80 0x1
+#define BD718XX_LDO2_VRMON80 0x2
+#define BD718XX_LDO3_VRMON80 0x4
+#define BD718XX_LDO4_VRMON80 0x8
+#define BD718XX_LDO5_VRMON80 0x10
+#define BD718XX_LDO6_VRMON80 0x20
+
+/* BD71837 specific voltage monitoring masks */
+#define BD71837_BUCK3_VRMON80 0x10
+#define BD71837_BUCK3_VRMON130 0x20
+#define BD71837_BUCK4_VRMON80 0x40
+#define BD71837_BUCK4_VRMON130 0x80
+#define BD71837_LDO7_VRMON80 0x40
+
/* BD71837_REG_IRQ bits */
#define IRQ_SWRST 0x40
#define IRQ_PWRON_S 0x20
--
2.18.0
[BUG]
fstrim on some btrfs only trims the unallocated space, not trimming any
space in existing block groups.
[CAUSE]
Before fstrim_range passed to btrfs_trim_fs(), it get truncated to
range [0, super->total_bytes).
So later btrfs_trim_fs() will only be able to trim block groups in range
[0, super->total_bytes).
While for btrfs, any bytenr aligned to sector size is valid, since btrfs use
its logical address space, there is nothing limiting the location where
we put block groups.
For btrfs with routine balance, it's quite easy to relocate all
block groups and bytenr of block groups will start beyond super->total_bytes.
In that case, btrfs will not trim existing block groups.
[FIX]
Just remove the truncation in btrfs_ioctl_fitrim(), so btrfs_trim_fs()
can get the unmodified range, which is normally set to [0, U64_MAX].
Reported-by: Chris Murphy <lists(a)colorremedies.com>
Fixes: f4c697e6406d ("btrfs: return EINVAL if start > total_bytes in fitrim ioctl")
Cc: <stable(a)vger.kernel.org> # v4.0+
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
---
fs/btrfs/extent-tree.c | 10 +---------
fs/btrfs/ioctl.c | 11 +++++++----
2 files changed, 8 insertions(+), 13 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7768f206196a..d1478d66c7a5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10851,21 +10851,13 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
u64 start;
u64 end;
u64 trimmed = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
u64 bg_failed = 0;
u64 dev_failed = 0;
int bg_ret = 0;
int dev_ret = 0;
int ret = 0;
- /*
- * try to trim all FS space, our block group may start from non-zero.
- */
- if (range->len == total_bytes)
- cache = btrfs_lookup_first_block_group(fs_info, range->start);
- else
- cache = btrfs_lookup_block_group(fs_info, range->start);
-
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
for (; cache; cache = next_block_group(fs_info, cache)) {
if (cache->key.objectid >= (range->start + range->len)) {
btrfs_put_block_group(cache);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 63600dc2ac4c..8165a4bfa579 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -491,7 +491,6 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -515,11 +514,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return -EOPNOTSUPP;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
- if (range.start > total_bytes ||
- range.len < fs_info->sb->s_blocksize)
+
+ /*
+ * NOTE: Don't truncate the range using super->total_bytes.
+ * Bytenr of btrfs block group is in btrfs logical address space,
+ * which can be any sector size aligned bytenr in [0, U64_MAX].
+ */
+ if (range.len < fs_info->sb->s_blocksize)
return -EINVAL;
- range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info, &range);
if (ret < 0)
--
2.18.0