From: Ralph Campbell <rcampbell(a)nvidia.com>
Private ZONE_DEVICE pages use a special pte entry and thus are not
present. Properly handle this case in map_pte(), it is already handled
in check_pte(), the map_pte() part was lost in some rebase most probably.
Without this patch the slow migration path can not migrate back to any
private ZONE_DEVICE memory to regular memory. This was found after stress
testing migration back to system memory. This ultimatly can lead to the CPU
constantly page fault looping on the special swap entry.
Changes since v2:
- add comments explaining what is going on
Changes since v1:
- properly lock pte directory in map_pte()
Signed-off-by: Ralph Campbell <rcampbell(a)nvidia.com>
Signed-off-by: Jérôme Glisse <jglisse(a)redhat.com>
Reviewed-by: Balbir Singh <bsingharora(a)gmail.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
---
mm/page_vma_mapped.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index ae3c2a35d61b..11df03e71288 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -21,7 +21,29 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
if (!is_swap_pte(*pvmw->pte))
return false;
} else {
- if (!pte_present(*pvmw->pte))
+ /*
+ * We get here when we are trying to unmap a private
+ * device page from the process address space. Such
+ * page is not CPU accessible and thus is mapped as
+ * a special swap entry, nonetheless it still does
+ * count as a valid regular mapping for the page (and
+ * is accounted as such in page maps count).
+ *
+ * So handle this special case as if it was a normal
+ * page mapping ie lock CPU page table and returns
+ * true.
+ *
+ * For more details on device private memory see HMM
+ * (include/linux/hmm.h or mm/hmm.c).
+ */
+ if (is_swap_pte(*pvmw->pte)) {
+ swp_entry_t entry;
+
+ /* Handle un-addressable ZONE_DEVICE memory */
+ entry = pte_to_swp_entry(*pvmw->pte);
+ if (!is_device_private_entry(entry))
+ return false;
+ } else if (!pte_present(*pvmw->pte))
return false;
}
}
--
2.17.2
Detaching of mark connector from fsnotify_put_mark() can race with
unmounting of the filesystem like:
CPU1 CPU2
fsnotify_put_mark()
spin_lock(&conn->lock);
...
inode = fsnotify_detach_connector_from_object(conn)
spin_unlock(&conn->lock);
generic_shutdown_super()
fsnotify_unmount_inodes()
sees connector detached for inode
-> nothing to do
evict_inode()
barfs on pending inode reference
iput(inode);
Resulting in "Busy inodes after unmount" message and possible kernel
oops. Make fsnotify_unmount_inodes() properly wait for outstanding inode
references from detached connectors.
Note that the accounting of outstanding inode references in the
superblock can cause some cacheline contention on the counter. OTOH it
happens only during deletion of the last notification mark from an inode
(or during unlinking of watched inode) and that is not too bad. I have
measured time to create & delete inotify watch 100000 times from 64
processes in parallel (each process having its own inotify group and its
own file on a shared superblock) on a 64 CPU machine. Average and
standard deviation of 15 runs look like:
Avg Stddev
Vanilla 9.817400 0.276165
Fixed 9.710467 0.228294
So there's no statistically significant difference.
Fixes: 6b3f05d24d35 ("fsnotify: Detach mark from object list when last reference is dropped")
CC: stable(a)vger.kernel.org
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/notify/fsnotify.c | 3 +++
fs/notify/mark.c | 39 +++++++++++++++++++++++++++++++--------
include/linux/fs.h | 3 +++
3 files changed, 37 insertions(+), 8 deletions(-)
Changes since v1:
* added Fixes tag
* improved fsnotify_drop_object to take object type
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index f174397b63a0..00d4f4357724 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -96,6 +96,9 @@ void fsnotify_unmount_inodes(struct super_block *sb)
if (iput_inode)
iput(iput_inode);
+ /* Wait for outstanding inode references from connectors */
+ wait_var_event(&sb->s_fsnotify_inode_refs,
+ !atomic_long_read(&sb->s_fsnotify_inode_refs));
}
/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 59cdb27826de..f4e330b5b379 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -179,17 +179,20 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
}
}
-static struct inode *fsnotify_detach_connector_from_object(
- struct fsnotify_mark_connector *conn)
+static void *fsnotify_detach_connector_from_object(
+ struct fsnotify_mark_connector *conn,
+ unsigned int *type)
{
struct inode *inode = NULL;
+ *type = conn->type;
if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED)
return NULL;
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
+ atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs);
} else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
}
@@ -211,10 +214,29 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
fsnotify_put_group(group);
}
+/* Drop object reference originally held by a connector */
+static void fsnotify_drop_object(unsigned int type, void *objp)
+{
+ struct inode *inode;
+ struct super_block *sb;
+
+ if (!objp)
+ return;
+ /* Currently only inode references are passed to be dropped */
+ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
+ return;
+ inode = objp;
+ sb = inode->i_sb;
+ iput(inode);
+ if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs))
+ wake_up_var(&sb->s_fsnotify_inode_refs);
+}
+
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
struct fsnotify_mark_connector *conn;
- struct inode *inode = NULL;
+ void *objp = NULL;
+ unsigned int type;
bool free_conn = false;
/* Catch marks that were actually never attached to object */
@@ -234,7 +256,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
conn = mark->connector;
hlist_del_init_rcu(&mark->obj_list);
if (hlist_empty(&conn->list)) {
- inode = fsnotify_detach_connector_from_object(conn);
+ objp = fsnotify_detach_connector_from_object(conn, &type);
free_conn = true;
} else {
__fsnotify_recalc_mask(conn);
@@ -242,7 +264,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
mark->connector = NULL;
spin_unlock(&conn->lock);
- iput(inode);
+ fsnotify_drop_object(type, objp);
if (free_conn) {
spin_lock(&destroy_lock);
@@ -709,7 +731,8 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp)
{
struct fsnotify_mark_connector *conn;
struct fsnotify_mark *mark, *old_mark = NULL;
- struct inode *inode;
+ void *objp;
+ unsigned int type;
conn = fsnotify_grab_connector(connp);
if (!conn)
@@ -735,11 +758,11 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp)
* mark references get dropped. It would lead to strange results such
* as delaying inode deletion or blocking unmount.
*/
- inode = fsnotify_detach_connector_from_object(conn);
+ objp = fsnotify_detach_connector_from_object(conn, &type);
spin_unlock(&conn->lock);
if (old_mark)
fsnotify_put_mark(old_mark);
- iput(inode);
+ fsnotify_drop_object(type, objp);
}
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 33322702c910..5090f3dcec3b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1428,6 +1428,9 @@ struct super_block {
/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;
+ /* Pending fsnotify inode refs */
+ atomic_long_t s_fsnotify_inode_refs;
+
/* Being remounted read-only */
int s_readonly_remount;
--
2.16.4
Detaching of mark connector from fsnotify_put_mark() can race with
unmounting of the filesystem like:
CPU1 CPU2
fsnotify_put_mark()
spin_lock(&conn->lock);
...
inode = fsnotify_detach_connector_from_object(conn)
spin_unlock(&conn->lock);
generic_shutdown_super()
fsnotify_unmount_inodes()
sees connector detached for inode
-> nothing to do
evict_inode()
barfs on pending inode reference
iput(inode);
Resulting in "Busy inodes after unmount" message and possible kernel
oops. Make fsnotify_unmount_inodes() properly wait for outstanding inode
references from detached connectors.
Note that the accounting of outstanding inode references in the
superblock can cause some cacheline contention on the counter. OTOH it
happens only during deletion of the last notification mark from an inode
(or during unlinking of watched inode) and that is not too bad. I have
measured time to create & delete inotify watch 100000 times from 64
processes in parallel (each process having its own inotify group and its
own file on a shared superblock) on a 64 CPU machine. Average and
standard deviation of 15 runs look like:
Avg Stddev
Vanilla 9.817400 0.276165
Fixed 9.710467 0.228294
So there's no statistically significant difference.
Fixes: 6b3f05d24d35 ("fsnotify: Detach mark from object list when last reference is dropped")
CC: stable(a)vger.kernel.org
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/notify/fsnotify.c | 3 +++
fs/notify/mark.c | 39 +++++++++++++++++++++++++++++++--------
include/linux/fs.h | 3 +++
3 files changed, 37 insertions(+), 8 deletions(-)
Changes since v2:
* fixed uninitialized warning
Changes since v1:
* added Fixes tag
* fsnotify_drop_object() now takes type
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index f174397b63a0..00d4f4357724 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -96,6 +96,9 @@ void fsnotify_unmount_inodes(struct super_block *sb)
if (iput_inode)
iput(iput_inode);
+ /* Wait for outstanding inode references from connectors */
+ wait_var_event(&sb->s_fsnotify_inode_refs,
+ !atomic_long_read(&sb->s_fsnotify_inode_refs));
}
/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 59cdb27826de..09535f6423fc 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -179,17 +179,20 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
}
}
-static struct inode *fsnotify_detach_connector_from_object(
- struct fsnotify_mark_connector *conn)
+static void *fsnotify_detach_connector_from_object(
+ struct fsnotify_mark_connector *conn,
+ unsigned int *type)
{
struct inode *inode = NULL;
+ *type = conn->type;
if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED)
return NULL;
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
+ atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs);
} else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
}
@@ -211,10 +214,29 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
fsnotify_put_group(group);
}
+/* Drop object reference originally held by a connector */
+static void fsnotify_drop_object(unsigned int type, void *objp)
+{
+ struct inode *inode;
+ struct super_block *sb;
+
+ if (!objp)
+ return;
+ /* Currently only inode references are passed to be dropped */
+ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
+ return;
+ inode = objp;
+ sb = inode->i_sb;
+ iput(inode);
+ if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs))
+ wake_up_var(&sb->s_fsnotify_inode_refs);
+}
+
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
struct fsnotify_mark_connector *conn;
- struct inode *inode = NULL;
+ void *objp = NULL;
+ unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED;
bool free_conn = false;
/* Catch marks that were actually never attached to object */
@@ -234,7 +256,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
conn = mark->connector;
hlist_del_init_rcu(&mark->obj_list);
if (hlist_empty(&conn->list)) {
- inode = fsnotify_detach_connector_from_object(conn);
+ objp = fsnotify_detach_connector_from_object(conn, &type);
free_conn = true;
} else {
__fsnotify_recalc_mask(conn);
@@ -242,7 +264,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
mark->connector = NULL;
spin_unlock(&conn->lock);
- iput(inode);
+ fsnotify_drop_object(type, objp);
if (free_conn) {
spin_lock(&destroy_lock);
@@ -709,7 +731,8 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp)
{
struct fsnotify_mark_connector *conn;
struct fsnotify_mark *mark, *old_mark = NULL;
- struct inode *inode;
+ void *objp;
+ unsigned int type;
conn = fsnotify_grab_connector(connp);
if (!conn)
@@ -735,11 +758,11 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp)
* mark references get dropped. It would lead to strange results such
* as delaying inode deletion or blocking unmount.
*/
- inode = fsnotify_detach_connector_from_object(conn);
+ objp = fsnotify_detach_connector_from_object(conn, &type);
spin_unlock(&conn->lock);
if (old_mark)
fsnotify_put_mark(old_mark);
- iput(inode);
+ fsnotify_drop_object(type, objp);
}
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 33322702c910..5090f3dcec3b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1428,6 +1428,9 @@ struct super_block {
/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;
+ /* Pending fsnotify inode refs */
+ atomic_long_t s_fsnotify_inode_refs;
+
/* Being remounted read-only */
int s_readonly_remount;
--
2.16.4
Since acpi_os_get_timer() may be called after the timer subsystem has
been suspended, use the jiffies counter instead of ktime_get(). This
patch avoids that the following warning is reported during hibernation:
WARNING: CPU: 0 PID: 612 at kernel/time/timekeeping.c:751 ktime_get+0x116/0x120
RIP: 0010:ktime_get+0x116/0x120
Call Trace:
acpi_os_get_timer+0xe/0x30
acpi_ds_exec_begin_control_op+0x175/0x1de
acpi_ds_exec_begin_op+0x2c7/0x39a
acpi_ps_create_op+0x573/0x5e4
acpi_ps_parse_loop+0x349/0x1220
acpi_ps_parse_aml+0x25b/0x6da
acpi_ps_execute_method+0x327/0x41b
acpi_ns_evaluate+0x4e9/0x6f5
acpi_ut_evaluate_object+0xd9/0x2f2
acpi_rs_get_method_data+0x8f/0x114
acpi_walk_resources+0x122/0x1b6
acpi_pci_link_get_current.isra.2+0x157/0x280
acpi_pci_link_set+0x32f/0x4a0
irqrouter_resume+0x58/0x80
syscore_resume+0x84/0x380
hibernation_snapshot+0x20c/0x4f0
hibernate+0x22d/0x3a6
state_store+0x99/0xa0
kobj_attr_store+0x37/0x50
sysfs_kf_write+0x87/0xa0
kernfs_fop_write+0x1a5/0x240
__vfs_write+0xd2/0x410
vfs_write+0x101/0x250
ksys_write+0xab/0x120
__x64_sys_write+0x43/0x50
do_syscall_64+0x71/0x220
entry_SYSCALL_64_after_hwframe+0x49/0xbe
Fixes: 164a08cee135 ("ACPICA: Dispatcher: Introduce timeout mechanism for infinite loop detection")
Reported-by: Fengguang Wu <fengguang.wu(a)intel.com>
References: https://lists.01.org/pipermail/lkp/2018-April/008406.html
Cc: Rafael J. Wysocki <rjw(a)rjwysocki.net>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Len Brown <lenb(a)kernel.org>
Cc: Yu Chen <yu.c.chen(a)intel.com>
Cc: Fengguang Wu <fengguang.wu(a)intel.com>
Cc: linux-acpi(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Signed-off-by: Bart Van Assche <bvanassche(a)acm.org>
---
drivers/acpi/osl.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 8df9abfa947b..ed73f6fb0779 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -617,15 +617,18 @@ void acpi_os_stall(u32 us)
}
/*
- * Support ACPI 3.0 AML Timer operand
- * Returns 64-bit free-running, monotonically increasing timer
- * with 100ns granularity
+ * Support ACPI 3.0 AML Timer operand. Returns a 64-bit free-running,
+ * monotonically increasing timer with 100ns granularity. Do not use
+ * ktime_get() to implement this function because this function may get
+ * called after timekeeping has been suspended. Note: calling this function
+ * after timekeeping has been suspended may lead to unexpected results
+ * because when timekeeping is suspended the jiffies counter is not
+ * incremented. See also timekeeping_suspend().
*/
u64 acpi_os_get_timer(void)
{
- u64 time_ns = ktime_to_ns(ktime_get());
- do_div(time_ns, 100);
- return time_ns;
+ return (get_jiffies_64() - INITIAL_JIFFIES) *
+ (ACPI_100NSEC_PER_SEC / HZ);
}
acpi_status acpi_os_read_port(acpi_io_address port, u32 * value, u32 width)
--
2.19.1.568.g152ad8e336-goog
The calculated ideal rate can easily overflow an unsigned long, thus
making the best div selection buggy as soon as no ideal match is found
before the overflow occurs.
Fixes: 4731a72df273 ("drm/sun4i: request exact rates to our parents")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Boris Brezillon <boris.brezillon(a)bootlin.com>
Acked-by: Maxime Ripard <maxime.ripard(a)bootlin.com>
---
Changes in v2:
- Add a comment to explain why we bail out after an overflow
- Add Maxime ack
- Use a goto instead of a break
---
drivers/gpu/drm/sun4i/sun4i_dotclock.c | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/sun4i/sun4i_dotclock.c b/drivers/gpu/drm/sun4i/sun4i_dotclock.c
index e36004fbe453..2a15f2f9271e 100644
--- a/drivers/gpu/drm/sun4i/sun4i_dotclock.c
+++ b/drivers/gpu/drm/sun4i/sun4i_dotclock.c
@@ -81,9 +81,19 @@ static long sun4i_dclk_round_rate(struct clk_hw *hw, unsigned long rate,
int i;
for (i = tcon->dclk_min_div; i <= tcon->dclk_max_div; i++) {
- unsigned long ideal = rate * i;
+ u64 ideal = (u64)rate * i;
unsigned long rounded;
+ /*
+ * ideal has overflowed the max value that can be stored in an
+ * unsigned long, and every clk operation we might do on a
+ * truncated u64 value will give us incorrect results.
+ * Let's just stop there since bigger dividers will result in
+ * the same overflow issue.
+ */
+ if (ideal > ULONG_MAX)
+ goto out;
+
rounded = clk_hw_round_rate(clk_hw_get_parent(hw),
ideal);
--
2.14.1
This is a note to let you know that I've just added the patch titled
usbip:vudc: BUG kmalloc-2048 (Not tainted): Poison overwritten
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-next branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will also be merged in the next major kernel release
during the merge window.
If you have any questions about this process, please let me know.
>From e28fd56ad5273be67d0fae5bedc7e1680e729952 Mon Sep 17 00:00:00 2001
From: "Shuah Khan (Samsung OSG)" <shuah(a)kernel.org>
Date: Thu, 18 Oct 2018 10:19:29 -0600
Subject: usbip:vudc: BUG kmalloc-2048 (Not tainted): Poison overwritten
In rmmod path, usbip_vudc does platform_device_put() twice once from
platform_device_unregister() and then from put_vudc_device().
The second put results in:
BUG kmalloc-2048 (Not tainted): Poison overwritten error or
BUG: KASAN: use-after-free in kobject_put+0x1e/0x230 if KASAN is
enabled.
[ 169.042156] calling init+0x0/0x1000 [usbip_vudc] @ 1697
[ 169.042396] =============================================================================
[ 169.043678] probe of usbip-vudc.0 returned 1 after 350 usecs
[ 169.044508] BUG kmalloc-2048 (Not tainted): Poison overwritten
[ 169.044509] -----------------------------------------------------------------------------
...
[ 169.057849] INFO: Freed in device_release+0x2b/0x80 age=4223 cpu=3 pid=1693
[ 169.057852] kobject_put+0x86/0x1b0
[ 169.057853] 0xffffffffc0c30a96
[ 169.057855] __x64_sys_delete_module+0x157/0x240
Fix it to call platform_device_del() instead and let put_vudc_device() do
the platform_device_put().
Reported-by: Randy Dunlap <rdunlap(a)infradead.org>
Signed-off-by: Shuah Khan (Samsung OSG) <shuah(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/usbip/vudc_main.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/usbip/vudc_main.c b/drivers/usb/usbip/vudc_main.c
index 3fc22037a82f..390733e6937e 100644
--- a/drivers/usb/usbip/vudc_main.c
+++ b/drivers/usb/usbip/vudc_main.c
@@ -73,6 +73,10 @@ static int __init init(void)
cleanup:
list_for_each_entry_safe(udc_dev, udc_dev2, &vudc_devices, dev_entry) {
list_del(&udc_dev->dev_entry);
+ /*
+ * Just do platform_device_del() here, put_vudc_device()
+ * calls the platform_device_put()
+ */
platform_device_del(udc_dev->pdev);
put_vudc_device(udc_dev);
}
@@ -89,7 +93,11 @@ static void __exit cleanup(void)
list_for_each_entry_safe(udc_dev, udc_dev2, &vudc_devices, dev_entry) {
list_del(&udc_dev->dev_entry);
- platform_device_unregister(udc_dev->pdev);
+ /*
+ * Just do platform_device_del() here, put_vudc_device()
+ * calls the platform_device_put()
+ */
+ platform_device_del(udc_dev->pdev);
put_vudc_device(udc_dev);
}
platform_driver_unregister(&vudc_driver);
--
2.19.1
The patch titled
Subject: hugetlbfs: dirty pages as they are added to pagecache
has been added to the -mm tree. Its filename is
hugetlbfs-dirty-pages-as-they-are-added-to-pagecache.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/hugetlbfs-dirty-pages-as-they-are-…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/hugetlbfs-dirty-pages-as-they-are-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Subject: hugetlbfs: dirty pages as they are added to pagecache
Some test systems were experiencing negative huge page reserve counts and
incorrect file block counts. This was traced to /proc/sys/vm/drop_caches
removing clean pages from hugetlbfs file pagecaches. When non-hugetlbfs
explicit code removes the pages, the appropriate accounting is not
performed.
This can be recreated as follows:
fallocate -l 2M /dev/hugepages/foo
echo 1 > /proc/sys/vm/drop_caches
fallocate -l 2M /dev/hugepages/foo
grep -i huge /proc/meminfo
AnonHugePages: 0 kB
ShmemHugePages: 0 kB
HugePages_Total: 2048
HugePages_Free: 2047
HugePages_Rsvd: 18446744073709551615
HugePages_Surp: 0
Hugepagesize: 2048 kB
Hugetlb: 4194304 kB
ls -lsh /dev/hugepages/foo
4.0M -rw-r--r--. 1 root root 2.0M Oct 17 20:05 /dev/hugepages/foo
To address this issue, dirty pages as they are added to pagecache. This
can easily be reproduced with fallocate as shown above. Read faulted
pages will eventually end up being marked dirty. But there is a window
where they are clean and could be impacted by code such as drop_caches.
So, just dirty them all as they are added to the pagecache.
In addition, it makes little sense to even try to drop hugetlbfs pagecache
pages, so disable calls to these filesystems in drop_caches code.
Link: http://lkml.kernel.org/r/20181018041022.4529-1-mike.kravetz@oracle.com
Fixes: 70c3547e36f5 ("hugetlbfs: add hugetlbfs_fallocate()")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Alexander Viro <viro(a)zeniv.linux.org.uk>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/fs/drop_caches.c~hugetlbfs-dirty-pages-as-they-are-added-to-pagecache
+++ a/fs/drop_caches.c
@@ -9,6 +9,7 @@
#include <linux/writeback.h>
#include <linux/sysctl.h>
#include <linux/gfp.h>
+#include <linux/magic.h>
#include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */
@@ -18,6 +19,12 @@ static void drop_pagecache_sb(struct sup
{
struct inode *inode, *toput_inode = NULL;
+ /*
+ * It makes no sense to try and drop hugetlbfs page cache pages.
+ */
+ if (sb->s_magic == HUGETLBFS_MAGIC)
+ return;
+
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
--- a/mm/hugetlb.c~hugetlbfs-dirty-pages-as-they-are-added-to-pagecache
+++ a/mm/hugetlb.c
@@ -3690,6 +3690,12 @@ int huge_add_to_page_cache(struct page *
return err;
ClearPagePrivate(page);
+ /*
+ * set page dirty so that it will not be removed from cache/file
+ * by non-hugetlbfs specific code paths.
+ */
+ set_page_dirty(page);
+
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
_
Patches currently in -mm which might be from mike.kravetz(a)oracle.com are
hugetlbfs-dirty-pages-as-they-are-added-to-pagecache.patch
On Thu, 18 Oct 2018 11:23:12 PDT (-0700), merker(a)debian.org wrote:
> On Thu, Oct 18, 2018 at 11:13:02AM +0200, gregkh(a)linuxfoundation.org wrote:
>>
>> This is a note to let you know that I've just added the patch titled
>>
>> RISC-V: include linux/ftrace.h in asm-prototypes.h
>>
>> to the 4.4-stable tree which can be found at:
>> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>>
>> The filename of the patch is:
>> risc-v-include-linux-ftrace.h-in-asm-prototypes.h.patch
>> and it can be found in the queue-4.4 subdirectory.
>>
>> If you, or anyone else, feels it should not be added to the stable tree,
>> please let <stable(a)vger.kernel.org> know about it.
> [...]
>> From: James Cowgill <jcowgill(a)debian.org>
>> Date: Thu, 6 Sep 2018 22:57:56 +0100
>> Subject: RISC-V: include linux/ftrace.h in asm-prototypes.h
>>
>> From: James Cowgill <jcowgill(a)debian.org>
>>
>> [ Upstream commit 57a489786de9ec37d6e25ef1305dc337047f0236 ]
>
> I guess it doesn't make much sense to add this patch to the 4.4
> and 3.18 stable trees. The patch creates an arch-specific header
> (arch/riscv/include/asm/asm-prototypes.h), but the first mainline
> kernel with support for the RISC-V architecture has been kernel
> 4.15.
I agree.