Inode's i_io_list list head is used to attach inode to several different
lists - wb->{b_dirty, b_dirty_time, b_io, b_more_io}. When flush worker
prepares a list of inodes to writeback e.g. for sync(2), it moves inodes
to b_io list. Thus it is critical for sync(2) data integrity guarantees
that inode is not requeued to any other writeback list when inode is
queued for processing by flush worker. That's the reason why
writeback_single_inode() does not touch i_io_list (unless the inode is
completely clean) and why __mark_inode_dirty() does not touch i_io_list
if I_SYNC flag is set.
However there are two flaws in the current logic:
1) When inode has only I_DIRTY_TIME set but it is already queued in b_io
list due to sync(2), concurrent __mark_inode_dirty(inode, I_DIRTY_SYNC)
can still move inode back to b_dirty list resulting in skipping
writeback of inode time stamps during sync(2).
2) When inode is on b_dirty_time list and writeback_single_inode() races
with __mark_inode_dirty() like:
writeback_single_inode() __mark_inode_dirty(inode, I_DIRTY_PAGES)
inode->i_state |= I_SYNC
__writeback_single_inode()
inode->i_state |= I_DIRTY_PAGES;
if (inode->i_state & I_SYNC)
bail
if (!(inode->i_state & I_DIRTY_ALL))
- not true so nothing done
We end up with I_DIRTY_PAGES inode on b_dirty_time list and thus
standard background writeback will not writeback this inode leading to
possible dirty throttling stalls etc. (thanks to Martijn Coenen for this
analysis).
Fix these problems by tracking whether inode is queued in b_io or
b_more_io lists in a new I_SYNC_QUEUED flag. When this flag is set, we
know flush worker has queued inode and we should not touch i_io_list.
On the other hand we also know that once flush worker is done with the
inode it will requeue the inode to appropriate dirty list. When
I_SYNC_QUEUED is not set, __mark_inode_dirty() can (and must) move inode
to appropriate dirty list.
Reported-by: Martijn Coenen <maco(a)android.com>
Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option")
CC: stable(a)vger.kernel.org
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/fs-writeback.c | 17 ++++++++++++-----
include/linux/fs.h | 8 ++++++--
2 files changed, 18 insertions(+), 7 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ff0b18331590..f470c10641c5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -146,6 +146,7 @@ static void inode_io_list_del_locked(struct inode *inode,
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
+ inode->i_state &= ~I_SYNC_QUEUED;
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
@@ -1187,6 +1188,7 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
inode->dirtied_when = jiffies;
}
inode_io_list_move_locked(inode, wb, &wb->b_dirty);
+ inode->i_state &= ~I_SYNC_QUEUED;
}
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
@@ -1262,8 +1264,11 @@ static int move_expired_inodes(struct list_head *delaying_queue,
break;
list_move(&inode->i_io_list, &tmp);
moved++;
+ spin_lock(&inode->i_lock);
if (flags & EXPIRE_DIRTY_ATIME)
- set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
+ inode->i_state |= I_DIRTY_TIME_EXPIRED;
+ inode->i_state |= I_SYNC_QUEUED;
+ spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb))
continue;
if (sb && sb != inode->i_sb)
@@ -1438,6 +1443,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
} else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
+ inode->i_state &= ~I_SYNC_QUEUED;
} else {
/* The inode is clean. Remove from writeback lists. */
inode_io_list_del_locked(inode, wb);
@@ -2301,11 +2307,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
inode->i_state |= flags;
/*
- * If the inode is being synced, just update its dirty state.
- * The unlocker will place the inode on the appropriate
- * superblock list, based upon its state.
+ * If the inode is queued for writeback by flush worker, just
+ * update its dirty state. Once the flush worker is done with
+ * the inode it will place it on the appropriate superblock
+ * list, based upon its state.
*/
- if (inode->i_state & I_SYNC)
+ if (inode->i_state & I_SYNC_QUEUED)
goto out_unlock_inode;
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 19ef6c88c152..48556efcdcf0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2157,6 +2157,10 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
*
* I_DONTCACHE Evict inode as soon as it is not used anymore.
*
+ * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists.
+ * Used to detect that mark_inode_dirty() should not move
+ * inode between dirty lists.
+ *
* Q: What is the difference between I_WILL_FREE and I_FREEING?
*/
#define I_DIRTY_SYNC (1 << 0)
@@ -2174,12 +2178,12 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
#define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP)
#define I_LINKABLE (1 << 10)
#define I_DIRTY_TIME (1 << 11)
-#define __I_DIRTY_TIME_EXPIRED 12
-#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED)
+#define I_DIRTY_TIME_EXPIRED (1 << 12)
#define I_WB_SWITCH (1 << 13)
#define I_OVL_INUSE (1 << 14)
#define I_CREATING (1 << 15)
#define I_DONTCACHE (1 << 16)
+#define I_SYNC_QUEUED (1 << 17)
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
--
2.16.4
Hello,
I found the following sctp commits:
582eea230536a6f104097dd46205822005d5fe3a ("sctp: fix possibly using a
bad saddr with a given dst")
backported in the following stable versions: v5.6.x, v5.5.x, v4.19.x,
v4.14.x, v4.9.x, v4.4.x.
5c3e82fe159622e46e91458c1a6509c321a62820 ("sctp: fix refcount bug in
sctp_wfree")
backported in the following stable versions: v5.6.x, v5.5.x, v4.19.x,
v4.14.x, v4.9.x
However I cannot find them in v5.4.x yet. I checked stable queue on
netdev side (http://patchwork.ozlabs.org/bundle/davem/stable/?state=*)
but also main stable queue
https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git
I was wondering whether it was an oversight or was it expected?
Sorry for the noise if I'm mistaken.
Best regards,
--
William
Hello,
I found the following commit fbe4e0c1b298b4665ee6915266c9d6c5b934ef4a
("ipv4: fix a RCU-list lock in fib_triestat_seq_show") backported in
the following stable versions: v5.6.x, v5.5.x, v4.19.x, v4.14.x,
v4.9.x, v4.4.x.
However I cannot find it in v5.4.x yet. I checked stable queue on
netdev side (http://patchwork.ozlabs.org/bundle/davem/stable/?state=*)
but also main stable queue
https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git
I was wondering whether it was an oversight or it was expected?
Sorry for the noise if I'm mistaken.
Best regards,
--
William
From: Hans de Goede <hdegoede(a)redhat.com>
Before commit cfc4c189bc70 ("pwm: Read initial hardware state at request
time"), a driver's get_state callback would get called once per PWM from
pwmchip_add().
pwm-lpss' runtime-pm code was relying on this, getting a runtime-pm ref for
PWMs which are enabled at probe time from within its get_state callback,
before enabling runtime-pm.
The change to calling get_state at request time causes a number of
problems:
1. PWMs enabled at probe time may get runtime suspended before they are
requested, causing e.g. a LCD backlight controlled by the PWM to turn off.
2. When the request happens when the PWM has been runtime suspended, the
ctrl register will read all 1 / 0xffffffff, causing get_state to store
bogus values in the pwm_state.
3. get_state was using an async pm_runtime_get() call, because it assumed
that runtime-pm has not been enabled yet. If shortly after the request an
apply call is made, then the pwm_lpss_is_updating() check may trigger
because the resume triggered by the pm_runtime_get() call is not complete
yet, so the ctrl register still reads all 1 / 0xffffffff.
This commit fixes these issues by moving the initial pm_runtime_get() call
for PWMs which are enabled at probe time to the pwm_lpss_probe() function;
and by making get_state take a runtime-pm ref before reading the ctrl reg.
BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1828927
Fixes: cfc4c189bc70 ("pwm: Read initial hardware state at request time")
Cc: stable(a)vger.kernel.org
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Signed-off-by: Thierry Reding <thierry.reding(a)gmail.com>
---
drivers/pwm/pwm-lpss.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c
index 75bbfe5f3bc2..9d965ffe66d1 100644
--- a/drivers/pwm/pwm-lpss.c
+++ b/drivers/pwm/pwm-lpss.c
@@ -158,7 +158,6 @@ static int pwm_lpss_apply(struct pwm_chip *chip, struct pwm_device *pwm,
return 0;
}
-/* This function gets called once from pwmchip_add to get the initial state */
static void pwm_lpss_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
struct pwm_state *state)
{
@@ -167,6 +166,8 @@ static void pwm_lpss_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
unsigned long long base_unit, freq, on_time_div;
u32 ctrl;
+ pm_runtime_get_sync(chip->dev);
+
base_unit_range = BIT(lpwm->info->base_unit_bits);
ctrl = pwm_lpss_read(pwm);
@@ -187,8 +188,7 @@ static void pwm_lpss_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
state->polarity = PWM_POLARITY_NORMAL;
state->enabled = !!(ctrl & PWM_ENABLE);
- if (state->enabled)
- pm_runtime_get(chip->dev);
+ pm_runtime_put(chip->dev);
}
static const struct pwm_ops pwm_lpss_ops = {
@@ -202,7 +202,8 @@ struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, struct resource *r,
{
struct pwm_lpss_chip *lpwm;
unsigned long c;
- int ret;
+ int i, ret;
+ u32 ctrl;
if (WARN_ON(info->npwm > MAX_PWMS))
return ERR_PTR(-ENODEV);
@@ -232,6 +233,12 @@ struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, struct resource *r,
return ERR_PTR(ret);
}
+ for (i = 0; i < lpwm->info->npwm; i++) {
+ ctrl = pwm_lpss_read(&lpwm->chip.pwms[i]);
+ if (ctrl & PWM_ENABLE)
+ pm_runtime_get(dev);
+ }
+
return lpwm;
}
EXPORT_SYMBOL_GPL(pwm_lpss_probe);
--
2.20.1
> Fix /proc/bootconfig to show the correctly choose the
> double or single quotes according to the value.
I suggest to improve this wording a bit.
Regards,
Markus
Some of tests in xfstests failed with cifsd kernel server since commit
e80ddeb2f70e. cifsd kernel server validates credit charge from client
by calculating it base on max((InputCount + OutputCount) and
(MaxInputResponse + MaxOutputResponse)) according to specification.
MS-SMB2 specification describe credit charge calculation of smb2 ioctl :
If Connection.SupportsMultiCredit is TRUE, the server MUST validate
CreditCharge based on the maximum of (InputCount + OutputCount) and
(MaxInputResponse + MaxOutputResponse), as specified in section 3.3.5.2.5.
If the validation fails, it MUST fail the IOCTL request with
STATUS_INVALID_PARAMETER.
This patch add indatalen that can be a non-zero value to calculation of
credit charge in SMB2_ioctl_init().
Fixes: e80ddeb2f70e ("smb3: fix incorrect number of credits when ioctl
MaxOutputResponse > 64K")
Cc: Stable <stable(a)vger.kernel.org>
Cc: Aurelien Aptel <aaptel(a)suse.com>
Cc: Steve French <smfrench(a)gmail.com>
Signed-off-by: Namjae Jeon <namjae.jeon(a)samsung.com>
---
fs/cifs/smb2pdu.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index ded96b529a4d..86d894499fbb 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2973,7 +2973,9 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
* response size smaller.
*/
req->MaxOutputResponse = cpu_to_le32(max_response_size);
- req->sync_hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(max_response_size, SMB2_MAX_BUFFER_SIZE));
+ req->sync_hdr.CreditCharge =
+ cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size),
+ SMB2_MAX_BUFFER_SIZE));
if (is_fsctl)
req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
else
--
2.17.1
From: "Eric W. Biederman" <ebiederm(a)xmission.com>
Recently syzbot reported that unmounting proc when there is an ongoing
inotify watch on the root directory of proc could result in a use
after free when the watch is removed after the unmount of proc
when the watcher exits.
Commit 69879c01a0c3 ("proc: Remove the now unnecessary internal mount
of proc") made it easier to unmount proc and allowed syzbot to see the
problem, but looking at the code it has been around for a long time.
Looking at the code the fsnotify watch should have been removed by
fsnotify_sb_delete in generic_shutdown_super. Unfortunately the inode
was allocated with new_inode_pseudo instead of new_inode so the inode
was not on the sb->s_inodes list. Which prevented
fsnotify_unmount_inodes from finding the inode and removing the watch
as well as made it so the "VFS: Busy inodes after unmount" warning
could not find the inodes to warn about them.
Make all of the inodes in proc visible to generic_shutdown_super,
and fsnotify_sb_delete by using new_inode instead of new_inode_pseudo.
The only functional difference is that new_inode places the inodes
on the sb->s_inodes list.
I wrote a small test program and I can verify that without changes it
can trigger this issue, and by replacing new_inode_pseudo with
new_inode the issues goes away.
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/000000000000d788c905a7dfa3f4@google.com
Reported-by: syzbot+7d2debdcdb3cb93c1e5e(a)syzkaller.appspotmail.com
Fixes: 0097875bd415 ("proc: Implement /proc/thread-self to point at the directory of the current thread")
Fixes: 021ada7dff22 ("procfs: switch /proc/self away from proc_dir_entry")
Fixes: 51f0885e5415 ("vfs,proc: guarantee unique inodes in /proc")
Signed-off-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
---
fs/proc/inode.c | 2 +-
fs/proc/self.c | 2 +-
fs/proc/thread_self.c | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index f40c2532c057..28d6105e908e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -617,7 +617,7 @@ const struct inode_operations proc_link_inode_operations = {
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
- struct inode *inode = new_inode_pseudo(sb);
+ struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = de->low_ino;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ca5158fa561c..72cd69bcaf4a 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -43,7 +43,7 @@ int proc_setup_self(struct super_block *s)
inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
if (self) {
- struct inode *inode = new_inode_pseudo(s);
+ struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = self_inum;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index ac284f409568..a553273fbd41 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -43,7 +43,7 @@ int proc_setup_thread_self(struct super_block *s)
inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
if (thread_self) {
- struct inode *inode = new_inode_pseudo(s);
+ struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = thread_self_inum;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
--
2.20.1
The suspend entry and exit code for 32-bit Tegra devices assumes that
the PLLM (which is used to provide the clock for external memory)
is always enabled on entry to suspend. Hence, the current code always
disables the PLLM on entry to suspend and re-enables the PLLM on exit
from suspend.
Since the introduction of the Tegra124 EMC driver by commit 73a7f0a90641
("memory: tegra: Add EMC (external memory controller) driver"), which is
used to scale the EMC frequency, PLLM may not be the current clock
source for the EMC on entry to suspend and hence may not be enabled.
Always enabling the PLLM on exit from suspend can cause the actual
status on the PLL to be different from that reported by the common clock
framework.
On kernels prior to v4.5, the code to set the rate of the PLLM had a
test to verify if the PLL was enabled and if the PLL was enabled,
setting the rate would fail. Since commit 267b62a96951
("clk: tegra: pll: Update PLLM handling") the test to see if PLLM is
enabled was removed.
With these earlier kernels, if the PLLM is disabled on entering suspend
and the EMC driver attempts to set the parent of the EMC clock to the
PLLM on exiting suspend, then the set rate for the PLLM will fail and in
turn cause the resume to fail.
We should not be re-enabling the PLLM on resume from suspend unless it
was enabled on entry to suspend. Therefore, fix this by saving the state
of PLLM on entry to suspend and only re-enable it, if it was already
enabled.
Fixes: 73a7f0a90641 ("memory: tegra: Add EMC (external memory controller) driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Jon Hunter <jonathanh(a)nvidia.com>
---
arch/arm/mach-tegra/sleep-tegra30.S | 33 +++++++++++++++++++++++------
1 file changed, 27 insertions(+), 6 deletions(-)
diff --git a/arch/arm/mach-tegra/sleep-tegra30.S b/arch/arm/mach-tegra/sleep-tegra30.S
index 3341a12bbb9c..c2f0793a424f 100644
--- a/arch/arm/mach-tegra/sleep-tegra30.S
+++ b/arch/arm/mach-tegra/sleep-tegra30.S
@@ -337,26 +337,42 @@ ENTRY(tegra30_lp1_reset)
add r1, r1, #2
wait_until r1, r7, r3
- /* enable PLLM via PMC */
+ /* restore PLLM state */
mov32 r2, TEGRA_PMC_BASE
+ adr r7, tegra_pllm_status
+ ldr r1, [r7]
+ cmp r2, #(1 << 12)
+ bne _skip_pllm
+
ldr r1, [r2, #PMC_PLLP_WB0_OVERRIDE]
orr r1, r1, #(1 << 12)
str r1, [r2, #PMC_PLLP_WB0_OVERRIDE]
pll_enable r1, r0, CLK_RESET_PLLM_BASE, 0
+ pll_locked r1, r0, CLK_RESET_PLLM_BASE
+
+_skip_pllm:
pll_enable r1, r0, CLK_RESET_PLLC_BASE, 0
pll_enable r1, r0, CLK_RESET_PLLX_BASE, 0
b _pll_m_c_x_done
_no_pll_iddq_exit:
- /* enable PLLM via PMC */
+ /* restore PLLM state */
mov32 r2, TEGRA_PMC_BASE
+ adr r7, tegra_pllm_status
+ ldr r1, [r7]
+ cmp r2, #(1 << 12)
+ bne _skip_pllm_no_iddq
+
ldr r1, [r2, #PMC_PLLP_WB0_OVERRIDE]
orr r1, r1, #(1 << 12)
str r1, [r2, #PMC_PLLP_WB0_OVERRIDE]
pll_enable r1, r0, CLK_RESET_PLLM_BASE, CLK_RESET_PLLM_MISC
+ pll_locked r1, r0, CLK_RESET_PLLM_BASE
+
+_skip_pllm_no_iddq:
pll_enable r1, r0, CLK_RESET_PLLC_BASE, CLK_RESET_PLLC_MISC
pll_enable r1, r0, CLK_RESET_PLLX_BASE, CLK_RESET_PLLX_MISC
@@ -364,7 +380,6 @@ _pll_m_c_x_done:
pll_enable r1, r0, CLK_RESET_PLLP_BASE, CLK_RESET_PLLP_MISC
pll_enable r1, r0, CLK_RESET_PLLA_BASE, CLK_RESET_PLLA_MISC
- pll_locked r1, r0, CLK_RESET_PLLM_BASE
pll_locked r1, r0, CLK_RESET_PLLP_BASE
pll_locked r1, r0, CLK_RESET_PLLA_BASE
pll_locked r1, r0, CLK_RESET_PLLC_BASE
@@ -526,6 +541,8 @@ __no_dual_emc_chanl:
ENDPROC(tegra30_lp1_reset)
.align L1_CACHE_SHIFT
+tegra_pllm_status:
+ .word 0
tegra30_sdram_pad_address:
.word TEGRA_EMC_BASE + EMC_CFG @0x0
.word TEGRA_EMC_BASE + EMC_ZCAL_INTERVAL @0x4
@@ -624,10 +641,14 @@ tegra30_switch_cpu_to_clk32k:
add r1, r1, #2
wait_until r1, r7, r9
- /* disable PLLM via PMC in LP1 */
+ /* disable PLLM, if enabled, via PMC in LP1 */
+ adr r1, tegra_pllm_status
ldr r0, [r4, #PMC_PLLP_WB0_OVERRIDE]
- bic r0, r0, #(1 << 12)
- str r0, [r4, #PMC_PLLP_WB0_OVERRIDE]
+ and r2, r0, #(1 << 12)
+ str r2, [r1]
+ cmp r2, #(1 << 12)
+ biceq r0, r0, #(1 << 12)
+ streq r0, [r4, #PMC_PLLP_WB0_OVERRIDE]
/* disable PLLP, PLLA, PLLC and PLLX */
ldr r0, [r5, #CLK_RESET_PLLP_BASE]
--
2.17.1