Curently we don't use any preallocation when a file is already closed
when allocating blocks (from writeback code when converting delayed
allocation). However for small files, using locality group preallocation
is actually desirable as that is not specific to a particular file.
Rather it is a method to pack small files together to reduce
fragmentation and for that the fact the file is closed is actually even
stronger hint the file would benefit from packing. So change the logic
to allow locality group preallocation in this case.
Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning")
CC: stable(a)vger.kernel.org
Reported-and-tested-by: Stefan Wahren <stefan.wahren(a)i2se.com>
Tested-by: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list(a)gmail.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/ext4/mballoc.c | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6251b4a6cc63..af1e49c3603f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5195,6 +5195,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits = ac->ac_sb->s_blocksize_bits;
loff_t size, isize;
+ bool inode_pa_eligible, group_pa_eligible;
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;
@@ -5202,25 +5203,27 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
+ group_pa_eligible = sbi->s_mb_group_prealloc > 0;
+ inode_pa_eligible = true;
size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
+ /* No point in using inode preallocation for closed files */
if ((size == isize) && !ext4_fs_is_busy(sbi) &&
- !inode_is_open_for_write(ac->ac_inode)) {
- ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
- return;
- }
+ !inode_is_open_for_write(ac->ac_inode))
+ inode_pa_eligible = false;
- if (sbi->s_mb_group_prealloc <= 0) {
- ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
- return;
- }
-
- /* don't use group allocation for large files */
size = max(size, isize);
- if (size > sbi->s_mb_stream_request) {
- ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ /* Don't use group allocation for large files */
+ if (size > sbi->s_mb_stream_request)
+ group_pa_eligible = false;
+
+ if (!group_pa_eligible) {
+ if (inode_pa_eligible)
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ else
+ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
}
--
2.35.3
mb_set_largest_free_order() updates lists containing groups with largest
chunk of free space of given order. The way it updates it leads to
always moving the group to the tail of the list. Thus allocations
looking for free space of given order effectively end up cycling through
all groups (and due to initialization in last to first order). This
spreads allocations among block groups which reduces performance for
rotating disks or low-end flash media. Change
mb_set_largest_free_order() to only update lists if the order of the
largest free chunk in the group changed.
Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning")
CC: stable(a)vger.kernel.org
Reported-and-tested-by: Stefan Wahren <stefan.wahren(a)i2se.com>
Tested-by: Ojaswin Mujoo <ojaswin(a)linux.ibm.com>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list(a)gmail.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/ext4/mballoc.c | 24 +++++++++++++-----------
1 file changed, 13 insertions(+), 11 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 41e1cfecac3b..6251b4a6cc63 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1077,23 +1077,25 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
struct ext4_sb_info *sbi = EXT4_SB(sb);
int i;
- if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) {
+ for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
+ if (grp->bb_counters[i] > 0)
+ break;
+ /* No need to move between order lists? */
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
+ i == grp->bb_largest_free_order) {
+ grp->bb_largest_free_order = i;
+ return;
+ }
+
+ if (grp->bb_largest_free_order >= 0) {
write_lock(&sbi->s_mb_largest_free_orders_locks[
grp->bb_largest_free_order]);
list_del_init(&grp->bb_largest_free_order_node);
write_unlock(&sbi->s_mb_largest_free_orders_locks[
grp->bb_largest_free_order]);
}
- grp->bb_largest_free_order = -1; /* uninit */
-
- for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) {
- if (grp->bb_counters[i] > 0) {
- grp->bb_largest_free_order = i;
- break;
- }
- }
- if (test_opt2(sb, MB_OPTIMIZE_SCAN) &&
- grp->bb_largest_free_order >= 0 && grp->bb_free) {
+ grp->bb_largest_free_order = i;
+ if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
write_lock(&sbi->s_mb_largest_free_orders_locks[
grp->bb_largest_free_order]);
list_add_tail(&grp->bb_largest_free_order_node,
--
2.35.3
From: Chuck Lever <chuck.lever(a)oracle.com>
commit f11ad7aa653130b71e2e89bed207f387718216d5 upstream.
RFC 8881 explains the purpose of the write verifier this way:
> The final portion of the result is the field writeverf. This field
> is the write verifier and is a cookie that the client can use to
> determine whether a server has changed instance state (e.g., server
> restart) between a call to WRITE and a subsequent call to either
> WRITE or COMMIT.
But then it says:
> This cookie MUST be unchanged during a single instance of the
> NFSv4.1 server and MUST be unique between instances of the NFSv4.1
> server. If the cookie changes, then the client MUST assume that
> any data written with an UNSTABLE4 value for committed and an old
> writeverf in the reply has been lost and will need to be
> recovered.
RFC 1813 has similar language for NFSv3. NFSv2 does not have a write
verifier since it doesn't implement the COMMIT procedure.
Since commit 19e0663ff9bc ("nfsd: Ensure sampling of the write
verifier is atomic with the write"), the Linux NFS server has
returned a boot-time-based verifier for UNSTABLE WRITEs, but a zero
verifier for FILE_SYNC and DATA_SYNC WRITEs. FILE_SYNC and DATA_SYNC
WRITEs are not followed up with a COMMIT, so there's no need for
clients to compare verifiers for stable writes.
However, by returning a different verifier for stable and unstable
writes, the above commit puts the Linux NFS server a step farther
out of compliance with the first MUST above. At least one NFS client
(FreeBSD) noticed the difference, making this a potential
regression.
Reported-by: Rick Macklem <rmacklem(a)uoguelph.ca>
Link: https://lore.kernel.org/linux-nfs/YQXPR0101MB096857EEACF04A6DF1FC6D9BDD749@…
Fixes: 19e0663ff9bc ("nfsd: Ensure sampling of the write verifier is atomic with the write")
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
Signed-off-by: Michael Kochera <kochera(a)google.com>
---
Removed down_write to fix the conflict in the cherry-pick. The down_write functionality was no longer needed there. Upstream commit 555dbf1a9aac6d3150c8b52fa35f768a692f4eeb titled nfsd: Replace use of rwsem with errseq_t removed those and replace it with new functionality that was more scalable. This commit is already backported onto 5.10 and so removing down_write ensures consistency with that change. Tested by compiling and booting successfully.
fs/nfsd/vfs.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c852bb5ff212..a4ae1fcd2ab1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1014,6 +1014,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
since = READ_ONCE(file->f_wb_err);
if (flags & RWF_SYNC) {
+ if (verf)
+ nfsd_copy_boot_verifier(verf,
+ net_generic(SVC_NET(rqstp),
+ nfsd_net_id));
host_err = vfs_iter_write(file, &iter, &pos, flags);
if (host_err < 0)
nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
--
2.37.2.789.g6183377224-goog
--
I am Stefano Pessina, an Italian business tycoon, investor, and
philanthropist. the vice chairman, chief executive officer (CEO), and
the single largest shareholder of Walgreens Boots Alliance. I gave
away 25 percent of my personal wealth to charity. And I also pledged
to give away the rest of 25% this year 2022 to Individuals.. I have
decided to donate $2,200,000.00 to you. If you are interested in my
donation, do contact me for more details.......
One of the side-effects of mb_optimize_scan was that the optimized
functions to select next group to try were called even before we tried
the goal group. As a result we no longer allocate files close to
corresponding inodes as well as we don't try to expand currently
allocated extent in the same group. This results in reaim regression
with workfile.disk workload of upto 8% with many clients on my test
machine:
baseline mb_optimize_scan
Hmean disk-1 2114.16 ( 0.00%) 2099.37 ( -0.70%)
Hmean disk-41 87794.43 ( 0.00%) 83787.47 * -4.56%*
Hmean disk-81 148170.73 ( 0.00%) 135527.05 * -8.53%*
Hmean disk-121 177506.11 ( 0.00%) 166284.93 * -6.32%*
Hmean disk-161 220951.51 ( 0.00%) 207563.39 * -6.06%*
Hmean disk-201 208722.74 ( 0.00%) 203235.59 ( -2.63%)
Hmean disk-241 222051.60 ( 0.00%) 217705.51 ( -1.96%)
Hmean disk-281 252244.17 ( 0.00%) 241132.72 * -4.41%*
Hmean disk-321 255844.84 ( 0.00%) 245412.84 * -4.08%*
Also this is causing huge regression (time increased by a factor of 5 or
so) when untarring archive with lots of small files on some eMMC storage
cards.
Fix the problem by making sure we try goal group first.
Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning")
CC: stable(a)vger.kernel.org
Reported-by: Stefan Wahren <stefan.wahren(a)i2se.com>
Link: https://lore.kernel.org/all/20220727105123.ckwrhbilzrxqpt24@quack3/
Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/ext4/mballoc.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bd8f8b5c3d30..41e1cfecac3b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1049,8 +1049,10 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
{
*new_cr = ac->ac_criteria;
- if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining)
+ if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
+ *group = next_linear_group(ac, *group, ngroups);
return;
+ }
if (*new_cr == 0) {
ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
@@ -2636,7 +2638,7 @@ static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
ext4_group_t prefetch_grp = 0, ngroups, group, i;
- int cr = -1;
+ int cr = -1, new_cr;
int err = 0, first_err = 0;
unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
@@ -2711,13 +2713,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
prefetch_grp = group;
- for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups),
- i++) {
- int ret = 0, new_cr;
+ for (i = 0, new_cr = cr; i < ngroups; i++,
+ ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
+ int ret = 0;
cond_resched();
-
- ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups);
if (new_cr != cr) {
cr = new_cr;
goto repeat;
--
2.35.3