Gao Xiang has reported that on ext4 O_SYNC direct IO does not properly
sync file size update and thus if we crash at unfortunate moment, the
file can have smaller size although O_SYNC IO has reported successful
completion. The problem happens because update of on-disk inode size is
handled in ext4_dio_write_iter() *after* iomap_dio_rw() (and thus
dio_complete() in particular) has returned and generic_file_sync() gets
called by dio_complete(). Fix the problem by handling on-disk inode size
update directly in our ->end_io completion handler.
References: https://lore.kernel.org/all/02d18236-26ef-09b0-90ad-030c4fe3ee20@linux.alib…
Reported-by: Gao Xiang <hsiangkao(a)linux.alibaba.com>
CC: stable(a)vger.kernel.org
Fixes: 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure")
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/ext4/file.c | 153 +++++++++++++++++++++----------------------------
1 file changed, 65 insertions(+), 88 deletions(-)
Changes since v2:
* Added more comments explaining the code flow
* Added WARN_ON_ONCE to verify extending IO is handled synchronously
Changes since v1:
* Rebased on top of Linus' tree (instead of a tree with iomap cleanup)
* Made ext4_dio_write_end_io() always return number of written bytes on
success for consistency
* Added Fixes tag
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6830ea3a6c59..19d9db4799c4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -306,80 +306,38 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
}
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
- ssize_t written, size_t count)
+ ssize_t count)
{
handle_t *handle;
- bool truncate = false;
- u8 blkbits = inode->i_blkbits;
- ext4_lblk_t written_blk, end_blk;
- int ret;
-
- /*
- * Note that EXT4_I(inode)->i_disksize can get extended up to
- * inode->i_size while the I/O was running due to writeback of delalloc
- * blocks. But, the code in ext4_iomap_alloc() is careful to use
- * zeroed/unwritten extents if this is possible; thus we won't leave
- * uninitialized blocks in a file even if we didn't succeed in writing
- * as much as we intended.
- */
- WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
- if (offset + count <= EXT4_I(inode)->i_disksize) {
- /*
- * We need to ensure that the inode is removed from the orphan
- * list if it has been added prematurely, due to writeback of
- * delalloc blocks.
- */
- if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-
- if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
- return PTR_ERR(handle);
- }
-
- ext4_orphan_del(handle, inode);
- ext4_journal_stop(handle);
- }
-
- return written;
- }
-
- if (written < 0)
- goto truncate;
+ lockdep_assert_held_write(&inode->i_rwsem);
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- written = PTR_ERR(handle);
- goto truncate;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
- if (ext4_update_inode_size(inode, offset + written)) {
- ret = ext4_mark_inode_dirty(handle, inode);
+ if (ext4_update_inode_size(inode, offset + count)) {
+ int ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret)) {
- written = ret;
ext4_journal_stop(handle);
- goto truncate;
+ return ret;
}
}
- /*
- * We may need to truncate allocated but not written blocks beyond EOF.
- */
- written_blk = ALIGN(offset + written, 1 << blkbits);
- end_blk = ALIGN(offset + count, 1 << blkbits);
- if (written_blk < end_blk && ext4_can_truncate(inode))
- truncate = true;
-
- /*
- * Remove the inode from the orphan list if it has been extended and
- * everything went OK.
- */
- if (!truncate && inode->i_nlink)
+ if (inode->i_nlink)
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
- if (truncate) {
-truncate:
+ return count;
+}
+
+/*
+ * Clean up the inode after DIO or DAX extending write has completed and the
+ * inode size has been updated using ext4_handle_inode_extension().
+ */
+static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count)
+{
+ lockdep_assert_held_write(&inode->i_rwsem);
+ if (count < 0) {
ext4_truncate_failed_write(inode);
/*
* If the truncate operation failed early, then the inode may
@@ -388,9 +346,28 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
*/
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
+ return;
}
+ /*
+ * If i_disksize got extended due to writeback of delalloc blocks while
+ * the DIO was running we could fail to cleanup the orphan list in
+ * ext4_handle_inode_extension(). Do it now.
+ */
+ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+ handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- return written;
+ if (IS_ERR(handle)) {
+ /*
+ * The write has successfully completed. Not much to
+ * do with the error here so just cleanup the orphan
+ * list and hope for the best.
+ */
+ ext4_orphan_del(NULL, inode);
+ return;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ }
}
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
@@ -399,31 +376,22 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
+ if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+ error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
if (error)
return error;
-
- if (size && flags & IOMAP_DIO_UNWRITTEN) {
- error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
- if (error < 0)
- return error;
- }
/*
- * If we are extending the file, we have to update i_size here before
- * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
- * buffered reads could zero out too much from page cache pages. Update
- * of on-disk size will happen later in ext4_dio_write_iter() where
- * we have enough information to also perform orphan list handling etc.
- * Note that we perform all extending writes synchronously under
- * i_rwsem held exclusively so i_size update is safe here in that case.
- * If the write was not extending, we cannot see pos > i_size here
- * because operations reducing i_size like truncate wait for all
- * outstanding DIO before updating i_size.
+ * Note that EXT4_I(inode)->i_disksize can get extended up to
+ * inode->i_size while the I/O was running due to writeback of delalloc
+ * blocks. But the code in ext4_iomap_alloc() is careful to use
+ * zeroed/unwritten extents if this is possible; thus we won't leave
+ * uninitialized blocks in a file even if we didn't succeed in writing
+ * as much as we intended.
*/
- pos += size;
- if (pos > i_size_read(inode))
- i_size_write(inode, pos);
-
- return 0;
+ WARN_ON_ONCE(i_size_read(inode) < READ_ONCE(EXT4_I(inode)->i_disksize));
+ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize))
+ return size;
+ return ext4_handle_inode_extension(inode, pos, size);
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -606,9 +574,16 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
dio_flags, NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
-
- if (extend)
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ if (extend) {
+ /*
+ * We always perform extending DIO write synchronously so by
+ * now the IO is completed and ext4_handle_inode_extension()
+ * was called. Cleanup the inode in case of error or race with
+ * writeback of delalloc blocks.
+ */
+ WARN_ON_ONCE(ret == -EIOCBQUEUED);
+ ext4_inode_extension_cleanup(inode, ret);
+ }
out:
if (ilock_shared)
@@ -689,8 +664,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
- if (extend)
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ if (extend) {
+ ret = ext4_handle_inode_extension(inode, offset, ret);
+ ext4_inode_extension_cleanup(inode, ret);
+ }
out:
inode_unlock(inode);
if (ret > 0)
--
2.35.3
stable-rc/linux-5.4.y build: 17 builds: 2 failed, 15 passed, 8 errors, 30 warnings (v5.4.258-119-g9842aef4b12b)
Full Build Summary: https://kernelci.org/build/stable-rc/branch/linux-5.4.y/kernel/v5.4.258-119…
Tree: stable-rc
Branch: linux-5.4.y
Git Describe: v5.4.258-119-g9842aef4b12b
Git Commit: 9842aef4b12b300a40f0bc2d408313e89a790d20
Git URL: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Built: 7 unique architectures
Build Failures Detected:
arm:
imx_v6_v7_defconfig: (gcc-10) FAIL
multi_v7_defconfig: (gcc-10) FAIL
Errors and Warnings Detected:
arc:
arm64:
defconfig (gcc-10): 2 warnings
defconfig+arm64-chromebook (gcc-10): 2 warnings
arm:
imx_v6_v7_defconfig (gcc-10): 4 errors, 2 warnings
multi_v7_defconfig (gcc-10): 4 errors, 2 warnings
i386:
allnoconfig (gcc-10): 2 warnings
i386_defconfig (gcc-10): 2 warnings
tinyconfig (gcc-10): 2 warnings
mips:
riscv:
x86_64:
allnoconfig (gcc-10): 4 warnings
tinyconfig (gcc-10): 4 warnings
x86_64_defconfig (gcc-10): 4 warnings
x86_64_defconfig+x86-chromebook (gcc-10): 4 warnings
Errors summary:
2 drivers/gpio/gpio-vf610.c:340:2: error: implicit declaration of function ‘gpio_irq_chip_set_chip’ [-Werror=implicit-function-declaration]
2 drivers/gpio/gpio-vf610.c:251:2: error: ‘GPIOCHIP_IRQ_RESOURCE_HELPERS’ undeclared here (not in a function)
2 drivers/gpio/gpio-vf610.c:250:6: error: ‘IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND’ undeclared here (not in a function); did you mean ‘IRQCHIP_MASK_ON_SUSPEND’?
2 drivers/gpio/gpio-vf610.c:249:11: error: ‘IRQCHIP_IMMUTABLE’ undeclared here (not in a function); did you mean ‘IS_IMMUTABLE’?
Warnings summary:
7 ld: warning: creating DT_TEXTREL in a PIE
4 ld: arch/x86/boot/compressed/head_64.o: warning: relocation in read-only section `.head.text'
4 arch/arm64/include/asm/memory.h:238:15: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
3 ld: arch/x86/boot/compressed/head_32.o: warning: relocation in read-only section `.head.text'
2 drivers/gpio/gpio-vf610.c:251:2: warning: excess elements in struct initializer
2 cc1: some warnings being treated as errors
2 arch/x86/entry/entry_64.o: warning: objtool: If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.
2 arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x1c1: unsupported intra-function call
2 arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x151: unsupported intra-function call
2 arch/x86/entry/entry_64.S:1756: Warning: no instruction mnemonic suffix given and no register operands; using default for `sysret'
Section mismatches summary:
1 WARNING: vmlinux.o(___ksymtab_gpl+vic_init_cascaded+0x0): Section mismatch in reference from the variable __ksymtab_vic_init_cascaded to the function .init.text:vic_init_cascaded()
================================================================================
Detailed per-defconfig build reports:
--------------------------------------------------------------------------------
32r2el_defconfig (mips, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
--------------------------------------------------------------------------------
allnoconfig (i386, gcc-10) — PASS, 0 errors, 2 warnings, 0 section mismatches
Warnings:
ld: arch/x86/boot/compressed/head_32.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
allnoconfig (x86_64, gcc-10) — PASS, 0 errors, 4 warnings, 0 section mismatches
Warnings:
arch/x86/entry/entry_64.S:1756: Warning: no instruction mnemonic suffix given and no register operands; using default for `sysret'
arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x151: unsupported intra-function call
ld: arch/x86/boot/compressed/head_64.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
defconfig (riscv, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
--------------------------------------------------------------------------------
defconfig (arm64, gcc-10) — PASS, 0 errors, 2 warnings, 0 section mismatches
Warnings:
arch/arm64/include/asm/memory.h:238:15: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
arch/arm64/include/asm/memory.h:238:15: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
--------------------------------------------------------------------------------
defconfig+arm64-chromebook (arm64, gcc-10) — PASS, 0 errors, 2 warnings, 0 section mismatches
Warnings:
arch/arm64/include/asm/memory.h:238:15: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
arch/arm64/include/asm/memory.h:238:15: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
--------------------------------------------------------------------------------
haps_hs_smp_defconfig (arc, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
--------------------------------------------------------------------------------
i386_defconfig (i386, gcc-10) — PASS, 0 errors, 2 warnings, 0 section mismatches
Warnings:
ld: arch/x86/boot/compressed/head_32.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
imx_v6_v7_defconfig (arm, gcc-10) — FAIL, 4 errors, 2 warnings, 0 section mismatches
Errors:
drivers/gpio/gpio-vf610.c:249:11: error: ‘IRQCHIP_IMMUTABLE’ undeclared here (not in a function); did you mean ‘IS_IMMUTABLE’?
drivers/gpio/gpio-vf610.c:250:6: error: ‘IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND’ undeclared here (not in a function); did you mean ‘IRQCHIP_MASK_ON_SUSPEND’?
drivers/gpio/gpio-vf610.c:251:2: error: ‘GPIOCHIP_IRQ_RESOURCE_HELPERS’ undeclared here (not in a function)
drivers/gpio/gpio-vf610.c:340:2: error: implicit declaration of function ‘gpio_irq_chip_set_chip’ [-Werror=implicit-function-declaration]
Warnings:
drivers/gpio/gpio-vf610.c:251:2: warning: excess elements in struct initializer
cc1: some warnings being treated as errors
--------------------------------------------------------------------------------
multi_v5_defconfig (arm, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
Section mismatches:
WARNING: vmlinux.o(___ksymtab_gpl+vic_init_cascaded+0x0): Section mismatch in reference from the variable __ksymtab_vic_init_cascaded to the function .init.text:vic_init_cascaded()
--------------------------------------------------------------------------------
multi_v7_defconfig (arm, gcc-10) — FAIL, 4 errors, 2 warnings, 0 section mismatches
Errors:
drivers/gpio/gpio-vf610.c:249:11: error: ‘IRQCHIP_IMMUTABLE’ undeclared here (not in a function); did you mean ‘IS_IMMUTABLE’?
drivers/gpio/gpio-vf610.c:250:6: error: ‘IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND’ undeclared here (not in a function); did you mean ‘IRQCHIP_MASK_ON_SUSPEND’?
drivers/gpio/gpio-vf610.c:251:2: error: ‘GPIOCHIP_IRQ_RESOURCE_HELPERS’ undeclared here (not in a function)
drivers/gpio/gpio-vf610.c:340:2: error: implicit declaration of function ‘gpio_irq_chip_set_chip’ [-Werror=implicit-function-declaration]
Warnings:
drivers/gpio/gpio-vf610.c:251:2: warning: excess elements in struct initializer
cc1: some warnings being treated as errors
--------------------------------------------------------------------------------
omap2plus_defconfig (arm, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
--------------------------------------------------------------------------------
tinyconfig (i386, gcc-10) — PASS, 0 errors, 2 warnings, 0 section mismatches
Warnings:
ld: arch/x86/boot/compressed/head_32.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
tinyconfig (x86_64, gcc-10) — PASS, 0 errors, 4 warnings, 0 section mismatches
Warnings:
arch/x86/entry/entry_64.S:1756: Warning: no instruction mnemonic suffix given and no register operands; using default for `sysret'
arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x151: unsupported intra-function call
ld: arch/x86/boot/compressed/head_64.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
vexpress_defconfig (arm, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
--------------------------------------------------------------------------------
x86_64_defconfig (x86_64, gcc-10) — PASS, 0 errors, 4 warnings, 0 section mismatches
Warnings:
arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x1c1: unsupported intra-function call
arch/x86/entry/entry_64.o: warning: objtool: If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.
ld: arch/x86/boot/compressed/head_64.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
x86_64_defconfig+x86-chromebook (x86_64, gcc-10) — PASS, 0 errors, 4 warnings, 0 section mismatches
Warnings:
arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x1c1: unsupported intra-function call
arch/x86/entry/entry_64.o: warning: objtool: If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.
ld: arch/x86/boot/compressed/head_64.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
---
For more info write to <info(a)kernelci.org>