In the eviction recency check, we are currently not holding a local reference to the memcg that the refaulted folio belonged to when it was evicted. This could cause serious memcg lifetime issues, for example in the memcg hierarchy traversal done in mem_cgroup_get_nr_swap_pages(). This has occurred in production:
[ 155757.793456] BUG: kernel NULL pointer dereference, address: 00000000000000c0
[ 155757.807568] #PF: supervisor read access in kernel mode
[ 155757.818024] #PF: error_code(0x0000) - not-present page
[ 155757.828482] PGD 401f77067 P4D 401f77067 PUD 401f76067 PMD 0
[ 155757.839985] Oops: 0000 [#1] SMP
[ 155757.846444] CPU: 7 PID: 1380944 Comm: ThriftSrv-pri3- Kdump: loaded Tainted: G S 6.4.3-0_fbk1_rc0_594_g8d0cbcaa67ba #1
[ 155757.870808] Hardware name: Wiwynn Twin Lakes MP/Twin Lakes Passive MP, BIOS YMM16 05/24/2021
[ 155757.887870] RIP: 0010:mem_cgroup_get_nr_swap_pages+0x3d/0xb0
[ 155757.899377] Code: 29 19 4a 02 48 39 f9 74 63 48 8b 97 c0 00 00 00 48 8b b7 58 02 00 00 48 2b b7 c0 01 00 00 48 39 f0 48 0f 4d c6 48 39 d1 74 42 <48> 8b b2 c0 00 00 00 48 8b ba 58 02 00 00 48 2b ba c0 01 00 00 48
[ 155757.937125] RSP: 0018:ffffc9002ecdfbc8 EFLAGS: 00010286
[ 155757.947755] RAX: 00000000003a3b1c RBX: 000007ffffffffff RCX: ffff888280183000
[ 155757.962202] RDX: 0000000000000000 RSI: 0007ffffffffffff RDI: ffff888bbc2d1000
[ 155757.976648] RBP: 0000000000000001 R08: 000000000000000b R09: ffff888ad9cedba0
[ 155757.991094] R10: ffffea0039c07900 R11: 0000000000000010 R12: ffff888b23a7b000
[ 155758.005540] R13: 0000000000000000 R14: ffff888bbc2d1000 R15: 000007ffffc71354
[ 155758.019991] FS:  00007f6234c68640(0000) GS:ffff88903f9c0000(0000) knlGS:0000000000000000
[ 155758.036356] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 155758.048023] CR2: 00000000000000c0 CR3: 0000000a83eb8004 CR4: 00000000007706e0
[ 155758.062473] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 155758.076924] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 155758.091376] PKRU: 55555554
[ 155758.096957] Call Trace:
[ 155758.102016]  <TASK>
[ 155758.106502]  ? __die+0x78/0xc0
[ 155758.112793]  ? page_fault_oops+0x286/0x380
[ 155758.121175]  ? exc_page_fault+0x5d/0x110
[ 155758.129209]  ? asm_exc_page_fault+0x22/0x30
[ 155758.137763]  ? mem_cgroup_get_nr_swap_pages+0x3d/0xb0
[ 155758.148060]  workingset_test_recent+0xda/0x1b0
[ 155758.157133]  workingset_refault+0xca/0x1e0
[ 155758.165508]  filemap_add_folio+0x4d/0x70
[ 155758.173538]  page_cache_ra_unbounded+0xed/0x190
[ 155758.182919]  page_cache_sync_ra+0xd6/0x1e0
[ 155758.191738]  filemap_read+0x68d/0xdf0
[ 155758.199495]  ? mlx5e_napi_poll+0x123/0x940
[ 155758.207981]  ? __napi_schedule+0x55/0x90
[ 155758.216095]  __x64_sys_pread64+0x1d6/0x2c0
[ 155758.224601]  do_syscall_64+0x3d/0x80
[ 155758.232058]  entry_SYSCALL_64_after_hwframe+0x46/0xb0
[ 155758.242473] RIP: 0033:0x7f62c29153b5
[ 155758.249938] Code: e8 48 89 75 f0 89 7d f8 48 89 4d e0 e8 b4 e6 f7 ff 41 89 c0 4c 8b 55 e0 48 8b 55 e8 48 8b 75 f0 8b 7d f8 b8 11 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 45 f8 e8 e7 e6 f7 ff 48 8b
[ 155758.288005] RSP: 002b:00007f6234c5ffd0 EFLAGS: 00000293 ORIG_RAX: 0000000000000011
[ 155758.303474] RAX: ffffffffffffffda RBX: 00007f628c4e70c0 RCX: 00007f62c29153b5
[ 155758.318075] RDX: 000000000003c041 RSI: 00007f61d2986000 RDI: 0000000000000076
[ 155758.332678] RBP: 00007f6234c5fff0 R08: 0000000000000000 R09: 0000000064d5230c
[ 155758.347452] R10: 000000000027d450 R11: 0000000000000293 R12: 000000000003c041
[ 155758.362044] R13: 00007f61d2986000 R14: 00007f629e11b060 R15: 000000000027d450
[ 155758.376661]  </TASK>
This patch fixes the issue by getting a local reference inside unpack_shadow().
Fixes: f78dfc7b77d5 ("workingset: fix confusion around eviction vs refault container")
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Cc: stable@vger.kernel.org
---
 mm/workingset.c | 57 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/mm/workingset.c b/mm/workingset.c
index da58a26d0d4d..c20b26bb6cb1 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -206,10 +206,11 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
 	return xa_mk_value(eviction);
 }
 
-static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
-			  unsigned long *evictionp, bool *workingsetp)
+static void unpack_shadow(void *shadow, struct mem_cgroup **memcgp,
+		pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp)
 {
 	unsigned long entry = xa_to_value(shadow);
+	struct mem_cgroup *memcg;
 	int memcgid, nid;
 	bool workingset;
 
@@ -220,7 +221,24 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
 	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
 	entry >>= MEM_CGROUP_ID_SHIFT;
 
-	*memcgidp = memcgid;
+	/*
+	 * Look up the memcg associated with the stored ID. It might
+	 * have been deleted since the folio's eviction.
+	 *
+	 * Note that in rare events the ID could have been recycled
+	 * for a new cgroup that refaults a shared folio. This is
+	 * impossible to tell from the available data. However, this
+	 * should be a rare and limited disturbance, and activations
+	 * are always speculative anyway. Ultimately, it's the aging
+	 * algorithm's job to shake out the minimum access frequency
+	 * for the active cache.
+	 */
+	memcg = mem_cgroup_from_id(memcgid);
+	if (memcg && css_tryget(&memcg->css))
+		*memcgp = memcg;
+	else
+		*memcgp = NULL;
+
 	*pgdat = NODE_DATA(nid);
 	*evictionp = entry;
 	*workingsetp = workingset;
@@ -262,15 +280,16 @@ static void *lru_gen_eviction(struct folio *folio)
 static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
 				unsigned long *token, bool *workingset)
 {
-	int memcg_id;
 	unsigned long min_seq;
 	struct mem_cgroup *memcg;
 	struct pglist_data *pgdat;
 
-	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
+	unpack_shadow(shadow, &memcg, &pgdat, token, workingset);
+	if (!mem_cgroup_disabled() && !memcg)
+		return false;
 
-	memcg = mem_cgroup_from_id(memcg_id);
 	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	mem_cgroup_put(memcg);
 
 	min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
 	return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
@@ -421,36 +440,29 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
 	unsigned long refault_distance;
 	unsigned long workingset_size;
 	unsigned long refault;
-	int memcgid;
 	struct pglist_data *pgdat;
 	unsigned long eviction;
 
 	if (lru_gen_enabled())
 		return lru_gen_test_recent(shadow, file, &eviction_lruvec,
 				&eviction, workingset);
 
-	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
-	eviction <<= bucket_order;
-
+	unpack_shadow(shadow, &eviction_memcg, &pgdat, &eviction, workingset);
 	/*
-	 * Look up the memcg associated with the stored ID. It might
-	 * have been deleted since the folio's eviction.
+	 * When memcg is enabled, we only get !memcg here if the
+	 * eviction group has been deleted. In that case, ignore
+	 * the refault.
 	 *
-	 * Note that in rare events the ID could have been recycled
-	 * for a new cgroup that refaults a shared folio. This is
-	 * impossible to tell from the available data. However, this
-	 * should be a rare and limited disturbance, and activations
-	 * are always speculative anyway. Ultimately, it's the aging
-	 * algorithm's job to shake out the minimum access frequency
-	 * for the active cache.
+	 * When memcg is disabled, we always get NULL since there
+	 * is no root_mem_cgroup for !CONFIG_MEMCG. Continue; the
+	 * mem_cgroup_lruvec() will get us the global lruvec.
 	 *
-	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
-	 * would be better if the root_mem_cgroup existed in all
+	 * XXX: It would be better if the root_mem_cgroup existed in all
 	 * configurations instead.
 	 */
-	eviction_memcg = mem_cgroup_from_id(memcgid);
 	if (!mem_cgroup_disabled() && !eviction_memcg)
 		return false;
 
+	eviction <<= bucket_order;
 	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
 	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
 
@@ -493,6 +505,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
 		}
 	}
 
+	mem_cgroup_put(eviction_memcg);
 	return refault_distance <= workingset_size;
 }
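[Background for the review below: pack_shadow() and unpack_shadow() agree on a bit layout inside a single xarray value, and only the memcg ID, not a pointer, survives eviction. A minimal userspace sketch of that round trip, with illustrative field widths standing in for the kernel's MEM_CGROUP_ID_SHIFT and NODES_SHIFT (the real constants are config-dependent):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define ID_BITS   16	/* stand-in for MEM_CGROUP_ID_SHIFT */
#define NODE_BITS  6	/* stand-in for NODES_SHIFT */

static unsigned long pack(unsigned long eviction, unsigned memcgid,
			  unsigned nid, bool workingset)
{
	eviction = (eviction << ID_BITS) | memcgid;
	eviction = (eviction << NODE_BITS) | nid;
	return (eviction << 1) | workingset;
}

static void unpack(unsigned long entry, unsigned long *eviction,
		   unsigned *memcgid, unsigned *nid, bool *workingset)
{
	*workingset = entry & 1;
	entry >>= 1;
	*nid = entry & ((1UL << NODE_BITS) - 1);
	entry >>= NODE_BITS;
	*memcgid = entry & ((1UL << ID_BITS) - 1);
	*eviction = entry >> ID_BITS;
}

int main(void)
{
	unsigned long ev;
	unsigned id, nid;
	bool ws;

	unpack(pack(12345, 42, 3, true), &ev, &id, &nid, &ws);
	assert(ev == 12345 && id == 42 && nid == 3 && ws);
	printf("shadow round trip ok\n");
	return 0;
}

The packed ID is only a weak handle: nothing in the shadow entry keeps the memcg alive, which is exactly the lifetime gap the patch closes.]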
On Thu, Aug 17, 2023 at 9:47 AM Nhat Pham <nphamcs@gmail.com> wrote:
> In the eviction recency check, we are currently not holding a local
> reference to the memcg that the refaulted folio belonged to when it
> was evicted. This could cause serious memcg lifetime issues, for
> example in the memcg hierarchy traversal done in
> mem_cgroup_get_nr_swap_pages(). This has occurred in production:
Doesn't workingset_refault() call workingset_test_recent() under RCU? Shouldn't this guarantee the validity of the memcg?
> -static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
> -			  unsigned long *evictionp, bool *workingsetp)
> +static void unpack_shadow(void *shadow, struct mem_cgroup **memcgp,
> +		pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp)
We should probably document that we are returning a memcg with a ref on it, or change the name to put "get" and "memcg" in there somewhere. Alternatively we could separate the function that gets the memcg from the shadow (e.g. get_memcg_from_shadow or so).
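[For concreteness, the split being floated might look something like the sketch below. The helper name comes from the suggestion above; shadow_memcgid() is a made-up stand-in for the existing unpacking logic, not a real function:

/*
 * Hypothetical helper (sketch only, not part of the patch): look up
 * and pin the memcg recorded in a shadow entry. The caller must hold
 * rcu_read_lock() and must mem_cgroup_put() a non-NULL result.
 */
static struct mem_cgroup *get_memcg_from_shadow(void *shadow)
{
	/* shadow_memcgid() is an assumed unpacking helper */
	int memcgid = shadow_memcgid(xa_to_value(shadow));
	struct mem_cgroup *memcg = mem_cgroup_from_id(memcgid);

	if (memcg && !css_tryget(&memcg->css))
		memcg = NULL;	/* cgroup already dying; treat as gone */
	return memcg;
}

With such a helper, the "get" in the name carries the reference-counting contract that a bare unpack_shadow() does not.]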
I would also hold an RCU read lock here so that the call to mem_cgroup_from_id() is valid. I know all callers currently do, but it may change later. Alternatively we can add an assertion or a warning.
Thanks for the review, Yosry!

On Thu, Aug 17, 2023 at 10:40 AM Yosry Ahmed <yosryahmed@google.com> wrote:
> On Thu, Aug 17, 2023 at 9:47 AM Nhat Pham <nphamcs@gmail.com> wrote:
> Doesn't workingset_refault() call workingset_test_recent() under RCU?
> Shouldn't this guarantee the validity of the memcg?
Yep, we're under RCU protection. But the eviction_memcg is grabbed from a memcg ID packed in the shadow entry - its reference count might have gone down to 0 in the meantime, and the memcg may already be getting killed.
This pattern (tryget memcg inside RCU) is done in a couple other places too.
> We should probably document that we are returning a memcg with a ref
> on it, or change the name to put "get" and "memcg" in there somewhere.
> Alternatively we could separate the function that gets the memcg from
> the shadow (e.g. get_memcg_from_shadow or so).
Hmm I feel like unpack as a verb is good here. But I can add a small comment/documentation above the function to signify that this function holds a reference to the memcg, and a corresponding put call will be needed, if that helps.
> I would also hold an RCU read lock here so that the call to
> mem_cgroup_from_id() is valid. I know all callers currently do, but it
> may change later. Alternatively we can add an assertion or a warning.
I prefer the assertion idea. But FWIW, mem_cgroup_from_id() already has a warning for this (WARN_ON_ONCE(!rcu_read_lock_held())).
Seems redundant to me, especially if the reason why we care about unpack_shadow() being under RCU is for mem_cgroup_from_id() to be valid.
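[For reference, the guard being discussed: mem_cgroup_from_id() is roughly the following, paraphrased from mm/memcontrol.c of this era, so take the exact body with a grain of salt:

struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
	/* the existing assertion the thread refers to */
	WARN_ON_ONCE(!rcu_read_lock_held());
	return idr_find(&mem_cgroup_idr, id);
}

So any unpack_shadow() caller that forgets the RCU lock would already trip this warning on a debug build.]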
On Thu, Aug 17, 2023 at 11:13 AM Nhat Pham <nphamcs@gmail.com> wrote:
> > Doesn't workingset_refault() call workingset_test_recent() under
> > RCU? Shouldn't this guarantee the validity of the memcg?
>
> Yep, we're under RCU protection. But the eviction_memcg is grabbed
> from a memcg ID packed in the shadow entry - its reference count might
> have gone down to 0 in the meantime, and the memcg may already be
> getting killed.
I see. Is this basically protecting against a race with mem_cgroup_id_put_many()? My thought was that, given we have an ID, we should have a ref on the memcg. But if we race with mem_cgroup_id_put_many(), we might get the ID just before it is removed and the ref is put, and then use the memcg after it is freed.
Is this what's happening here? If yes, I think this makes sense.
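[Spelling the suspected interleaving out, as I read the thread; the timeline below is illustrative, not taken from a trace:

/*
 * CPU A: refault path                  CPU B: cgroup teardown
 * -------------------                  ----------------------
 * rcu_read_lock()
 * memcg = mem_cgroup_from_id(id)
 *                                      last reference dropped;
 *                                      mem_cgroup_id_put_many()
 *                                      frees the ID, teardown begins
 * mem_cgroup_get_nr_swap_pages(memcg)
 *   walks memcg -> parent -> root,
 *   hitting state that teardown has
 *   already dismantled -> oops
 * rcu_read_unlock()
 *
 * css_tryget() on CPU A closes the window: it fails once the refcount
 * has reached zero, so the hierarchy walk never runs on a dying memcg.
 */
]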
> > We should probably document that we are returning a memcg with a ref
> > on it, or change the name to put "get" and "memcg" in there
> > somewhere.
>
> Hmm I feel like unpack as a verb is good here. But I can add a small
> comment/documentation above the function to signify that this function
> holds a reference to the memcg, and a corresponding put call will be
> needed, if that helps.
I don't want to be nitpicking, so no strong opinions here :)
> > I would also hold an RCU read lock here so that the call to
> > mem_cgroup_from_id() is valid. I know all callers currently do, but
> > it may change later. Alternatively we can add an assertion or a
> > warning.
>
> I prefer the assertion idea. But FWIW, mem_cgroup_from_id() already
> has a warning for this (WARN_ON_ONCE(!rcu_read_lock_held())).
>
> Seems redundant to me, especially if the reason why we care about
> unpack_shadow() being under RCU is for mem_cgroup_from_id() to be
> valid.
Fair enough, I guess basic testing with debugging enabled should spot any misuse quickly enough.
In the eviction recency check, we are currently not holding a local reference to the memcg that the refaulted folio belonged to when it was evicted. This could cause serious memcg lifetime issues, for example in the memcg hierarchy traversal done in mem_cgroup_get_nr_swap_pages(). This has occurred in production; see the oops quoted in full at the top of the thread.
This patch fixes the issue by getting a local reference inside unpack_shadow().
Fixes: f78dfc7b77d5 ("workingset: fix confusion around eviction vs refault container")
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Cc: stable@vger.kernel.org
---
 mm/workingset.c | 65 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/mm/workingset.c b/mm/workingset.c
index da58a26d0d4d..03cadad4e484 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -206,10 +206,19 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
 	return xa_mk_value(eviction);
 }
 
-static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
-			  unsigned long *evictionp, bool *workingsetp)
+/*
+ * Unpacks the stored fields of a shadow entry into the given pointers.
+ *
+ * The memcg pointer is only populated if the memcg recorded in the shadow
+ * entry is valid. In this case, a reference to the memcg will be acquired,
+ * and a corresponding mem_cgroup_put() will be needed when we no longer
+ * need the memcg.
+ */
+static void unpack_shadow(void *shadow, struct mem_cgroup **memcgp,
+		pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp)
 {
 	unsigned long entry = xa_to_value(shadow);
+	struct mem_cgroup *memcg;
 	int memcgid, nid;
 	bool workingset;
 
@@ -220,7 +229,24 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
 	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
 	entry >>= MEM_CGROUP_ID_SHIFT;
 
-	*memcgidp = memcgid;
+	/*
+	 * Look up the memcg associated with the stored ID. It might
+	 * have been deleted since the folio's eviction.
+	 *
+	 * Note that in rare events the ID could have been recycled
+	 * for a new cgroup that refaults a shared folio. This is
+	 * impossible to tell from the available data. However, this
+	 * should be a rare and limited disturbance, and activations
+	 * are always speculative anyway. Ultimately, it's the aging
+	 * algorithm's job to shake out the minimum access frequency
+	 * for the active cache.
+	 */
+	memcg = mem_cgroup_from_id(memcgid);
+	if (memcg && css_tryget(&memcg->css))
+		*memcgp = memcg;
+	else
+		*memcgp = NULL;
+
 	*pgdat = NODE_DATA(nid);
 	*evictionp = entry;
 	*workingsetp = workingset;
@@ -262,15 +288,16 @@ static void *lru_gen_eviction(struct folio *folio)
 static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
 				unsigned long *token, bool *workingset)
 {
-	int memcg_id;
 	unsigned long min_seq;
 	struct mem_cgroup *memcg;
 	struct pglist_data *pgdat;
 
-	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
+	unpack_shadow(shadow, &memcg, &pgdat, token, workingset);
+	if (!mem_cgroup_disabled() && !memcg)
+		return false;
 
-	memcg = mem_cgroup_from_id(memcg_id);
 	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	mem_cgroup_put(memcg);
 
 	min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
 	return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
@@ -421,36 +448,29 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
 	unsigned long refault_distance;
 	unsigned long workingset_size;
 	unsigned long refault;
-	int memcgid;
 	struct pglist_data *pgdat;
 	unsigned long eviction;
 
 	if (lru_gen_enabled())
 		return lru_gen_test_recent(shadow, file, &eviction_lruvec,
 				&eviction, workingset);
 
-	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
-	eviction <<= bucket_order;
-
+	unpack_shadow(shadow, &eviction_memcg, &pgdat, &eviction, workingset);
 	/*
-	 * Look up the memcg associated with the stored ID. It might
-	 * have been deleted since the folio's eviction.
+	 * When memcg is enabled, we only get !memcg here if the
+	 * eviction group has been deleted. In that case, ignore
+	 * the refault.
 	 *
-	 * Note that in rare events the ID could have been recycled
-	 * for a new cgroup that refaults a shared folio. This is
-	 * impossible to tell from the available data. However, this
-	 * should be a rare and limited disturbance, and activations
-	 * are always speculative anyway. Ultimately, it's the aging
-	 * algorithm's job to shake out the minimum access frequency
-	 * for the active cache.
+	 * When memcg is disabled, we always get NULL since there
+	 * is no root_mem_cgroup for !CONFIG_MEMCG. Continue; the
+	 * mem_cgroup_lruvec() will get us the global lruvec.
 	 *
-	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
-	 * would be better if the root_mem_cgroup existed in all
+	 * XXX: It would be better if the root_mem_cgroup existed in all
 	 * configurations instead.
 	 */
-	eviction_memcg = mem_cgroup_from_id(memcgid);
 	if (!mem_cgroup_disabled() && !eviction_memcg)
 		return false;
 
+	eviction <<= bucket_order;
 	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
 	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
 
@@ -493,6 +513,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
 		}
 	}
 
+	mem_cgroup_put(eviction_memcg);
 	return refault_distance <= workingset_size;
 }
On Thu, Aug 17, 2023 at 12:01 PM Nhat Pham <nphamcs@gmail.com> wrote:
> Fixes: f78dfc7b77d5 ("workingset: fix confusion around eviction vs refault container")
Beyond mem_cgroup_get_nr_swap_pages(), we still use the eviction_memcg without grabbing a ref to it first in workingset_test_recent() (and in workingset_refault() before that) as well as in lru_gen_test_recent().

Wouldn't the fix go back even further? Or am I misinterpreting the problem?
> -	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
> +	unpack_shadow(shadow, &memcg, &pgdat, token, workingset);
> +	if (!mem_cgroup_disabled() && !memcg)
> +		return false;
+Yu Zhao
There is a change of behavior here, right?
The existing code will continue if !mem_cgroup_disabled() && !memcg is true, and mem_cgroup_lruvec() will return the lruvec of the root memcg. Now we are just returning false.
Is this intentional?
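[For reference, the silent root fallback that produced the old behavior lives in mem_cgroup_lruvec(). Roughly paraphrased from include/linux/memcontrol.h of this era, with some details elided, so treat the exact body as approximate:

static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
					       struct pglist_data *pgdat)
{
	struct mem_cgroup_per_node *mz;

	if (mem_cgroup_disabled())
		return &pgdat->__lruvec;	/* the global lruvec */

	if (!memcg)
		memcg = root_mem_cgroup;	/* the old code leaned on this */

	mz = memcg->nodeinfo[pgdat->node_id];
	return &mz->lruvec;
}

With the patch, a NULL memcg from unpack_shadow() no longer reaches this fallback in lru_gen_test_recent(); the refault is simply treated as not recent.]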
On Thu, Aug 17, 2023 at 1:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > Fixes: f78dfc7b77d5 ("workingset: fix confusion around eviction vs refault container")
>
> Beyond mem_cgroup_get_nr_swap_pages(), we still use the eviction_memcg
> without grabbing a ref to it first in workingset_test_recent() (and in
> workingset_refault() before that) as well as in lru_gen_test_recent().
>
> Wouldn't the fix go back even further? Or am I misinterpreting the
> problem?
Hmm I don't see eviction_memcg being used outside of *_test_recent (the rest just uses memcg = folio_memcg(folio), which if I'm not mistaken is the memcg that is refaulting the folio into memory).
Inside workingset_test_recent(), the only other place where eviction_memcg is used is mem_cgroup_lruvec(). That call won't crash regardless of whether eviction_memcg is valid. The crash only happens during mem_cgroup_get_nr_swap_pages(), which does an upward traversal from eviction_memcg to the root.
Let me know if this does not make sense and/or is insufficient to ensure safe upward traversal from eviction_memcg to root!
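[For readers without the tree handy, the traversal in question has roughly this shape; paraphrased from mm/memcontrol.c of this era, so the exact body may differ slightly:

long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
	long nr_swap_pages = get_nr_swap_pages();

	if (mem_cgroup_disabled() || do_memsw_account())
		return nr_swap_pages;
	/*
	 * Upward walk: each step dereferences the parent memcg, which is
	 * only safe while the starting memcg is pinned.
	 */
	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
		nr_swap_pages = min_t(long, nr_swap_pages,
				      READ_ONCE(memcg->swap.max) -
				      page_counter_read(&memcg->swap));
	return nr_swap_pages;
}

The faulting address in the oops (0xc0, a small offset from NULL) is consistent with reading through a parent pointer of a memcg whose teardown had already begun.]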
+Yu Zhao
There is a change of behavior here, right?
The existing code will continue if !mem_cgroup_disabled() && !memcg is true, and mem_cgroup_lruvec() will return the lruvec of the root memcg. Now we are just returning false.
Is this intentional?
Oh right, there is. Should have cc-ed Yu Zhao as well, my bad. get_maintainers.pl isn't always sufficient I guess :)
But yeah, this behavioral change is intentional.
Correct me if I'm wrong of course, but it seems like MGLRU should follow the same pattern here. That is, once we return from unpack_shadow, the possible scenarios are the same as prescribed in workingset_test_recent:

1. If mem_cgroup is disabled, we can ignore this check.
2. If mem_cgroup is enabled, then the only reason why we get NULL memcg from unpack_shadow is if the eviction_memcg is no longer valid. We should not try to get its lruvec, or substitute it with the root memcg, but return false right away (i.e. not recent).
-	memcg = mem_cgroup_from_id(memcg_id);
 	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	mem_cgroup_put(memcg);
 
 	min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
 	return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
@@ -421,36 +448,29 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
 	unsigned long refault_distance;
 	unsigned long workingset_size;
 	unsigned long refault;
-	int memcgid;
 	struct pglist_data *pgdat;
 	unsigned long eviction;
 
 	if (lru_gen_enabled())
 		return lru_gen_test_recent(shadow, file, &eviction_lruvec,
 					   &eviction, workingset);
 
-	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
-	eviction <<= bucket_order;
+	unpack_shadow(shadow, &eviction_memcg, &pgdat, &eviction, workingset);
 
 	/*
-	 * Look up the memcg associated with the stored ID. It might
-	 * have been deleted since the folio's eviction.
+	 * When memcg is enabled, we only get !memcg here if the
+	 * eviction group has been deleted. In that case, ignore
+	 * the refault.
 	 *
-	 * Note that in rare events the ID could have been recycled
-	 * for a new cgroup that refaults a shared folio. This is
-	 * impossible to tell from the available data. However, this
-	 * should be a rare and limited disturbance, and activations
-	 * are always speculative anyway. Ultimately, it's the aging
-	 * algorithm's job to shake out the minimum access frequency
-	 * for the active cache.
+	 * When memcg is disabled, we always get NULL since there
+	 * is no root_mem_cgroup for !CONFIG_MEMCG. Continue; the
+	 * mem_cgroup_lruvec() will get us the global lruvec.
 	 *
-	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
-	 * would be better if the root_mem_cgroup existed in all
+	 * XXX: It would be better if the root_mem_cgroup existed in all
 	 * configurations instead.
 	 */
-	eviction_memcg = mem_cgroup_from_id(memcgid);
 	if (!mem_cgroup_disabled() && !eviction_memcg)
 		return false;
 
+	eviction <<= bucket_order;
+
 	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
 	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -493,6 +513,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
 		}
 	}
 
+	mem_cgroup_put(eviction_memcg);
 	return refault_distance <= workingset_size;
 }
--
2.34.1
On Thu, Aug 17, 2023 at 3:43 PM Nhat Pham <nphamcs@gmail.com> wrote:
On Thu, Aug 17, 2023 at 1:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
On Thu, Aug 17, 2023 at 12:01 PM Nhat Pham <nphamcs@gmail.com> wrote:
Inside workingset_test_recent(), the only other place where eviction_memcg is used is for mem_cgroup_lruvec. This function call won't crash whether eviction_memcg is valid or not.
If eviction_memcg is invalid because the memory was already freed, we are basically dereferencing garbage in mem_cgroup_lruvec() aren't we?
Correct me if I'm wrong of course, but it seems like MGLRU should follow the same pattern here. That is, once we return from unpack_shadow, the possible scenarios are the same as prescribed in workingset_test_recent:
- If mem_cgroup is disabled, we can ignore this check.
- If mem_cgroup is enabled, then the only reason why we get NULL memcg from unpack_shadow is if the eviction_memcg is no longer valid. We should not try to get its lruvec, or substitute it with the root memcg, but return false right away (i.e. not recent).
I will leave this for Yu :)
On Thu, Aug 17, 2023 at 4:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
On Thu, Aug 17, 2023 at 3:43 PM Nhat Pham <nphamcs@gmail.com> wrote:
On Thu, Aug 17, 2023 at 1:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
On Thu, Aug 17, 2023 at 12:01 PM Nhat Pham <nphamcs@gmail.com> wrote:
Thanks, Yosry.
Hi Nhat, it seems unnecessary to me to introduce a get/put into lru_gen_test_recent() because it doesn't suffer from the bug this patch tries to fix. In theory, the extra get/put can impact performance, though admittedly the impact is unlikely to be measurable. Regardless, the general practice is to fix the bug locally, i.e., when the mem_cgroup_get_nr_swap_pages() path is taken, rather than change the unrelated path. Thank you.
On Thu, Aug 17, 2023 at 05:12:17PM -0600, Yu Zhao wrote:
On Thu, Aug 17, 2023 at 4:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
On Thu, Aug 17, 2023 at 3:43 PM Nhat Pham <nphamcs@gmail.com> wrote:
On Thu, Aug 17, 2023 at 1:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
On Thu, Aug 17, 2023 at 12:01 PM Nhat Pham <nphamcs@gmail.com> wrote:
Hey guys,
I had suggested having it in unpack_shadow() to keep things simple, and to not further complicate the lifetime rules in this code. The tryget() is against a per-cpu counter, so it's not expensive.
The NULL deref is evidence that while *some* cgroup members are still accessible once it's dead, not all of them are. There is no explicit guarantee from the cgroup code that anything BUT the tryget() is still valid against a group that is under RCU freeing.
Since it isn't expensive, let's keep it simple and robust, and prevent future bugs of the same class, by always ensuring the cgroup is alive before accessing random members. Especially in non-cgroup code.
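To spell out the idiom (a minimal sketch with a made-up helper name, not the patch's literal code):

	static struct mem_cgroup *pin_eviction_memcg(int memcgid)
	{
		struct mem_cgroup *memcg;

		rcu_read_lock();
		memcg = mem_cgroup_from_id(memcgid);
		/* css_tryget() fails once the css has started dying. */
		if (memcg && !css_tryget(&memcg->css))
			memcg = NULL;
		rcu_read_unlock();

		/*
		 * On success, the caller may access any memcg member and
		 * must drop the pin with mem_cgroup_put() when done.
		 */
		return memcg;
	}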
On Fri, Aug 18, 2023 at 6:49 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
On Thu, Aug 17, 2023 at 05:12:17PM -0600, Yu Zhao wrote:
On Thu, Aug 17, 2023 at 4:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
On Thu, Aug 17, 2023 at 3:43 PM Nhat Pham <nphamcs@gmail.com> wrote:
On Thu, Aug 17, 2023 at 1:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
On Thu, Aug 17, 2023 at 12:01 PM Nhat Pham <nphamcs@gmail.com> wrote:
I looked at this again today with fresh eyes, and I want to go back to what I initially said. Isn't RCU protection in this case enough to keep the memcg "valid" (i.e accessible, not garbage)? The tryget is not a lot of complexity or performance tax, but I want to really understand what's happening here.
Looking at the code again, this seems to be the sequence of events on the cgroup side:
- css_put() puts the last reference invoking a call to css_release()
- css_release() queues css_release_work_fn()
- css_release() does some bookkeeping, makes some callbacks, and queues css_free_rwork_fn() to run *after* an RCU grace period.
- css_free_rwork_fn() makes callbacks to free the memory, ultimately freeing the memcg.

On the memcg idr side, the removal sequence of events seems to be:
- mem_cgroup_id_put() will decrement the id ref and check if it falls to 0
- If the id ref falls to 0, we call mem_cgroup_id_remove() *then* css_put()

On the workingset_refault() side, the sequence of events seems to be:
- rcu_read_lock()
- memcg = mem_cgroup_from_id()
- ... // use memcg
- rcu_read_unlock()
So technically, after holding the rcu read lock, if we find the memcg in the idr, it must be valid, and it must not be freed until after the rcu read section is completed. It's not just the cgroup internal implementation, it's the contract between cgroup core and controllers such as memcg.
The memory controller expects a sequence of callbacks during freeing: css_offline() -> css_released() -> css_free(). So memcg code is within its right to access any fields of struct mem_cgroup that are not freed by the css_offline() or css_released() until css_free() is called, right?
Here is a guess / question, because I am not really familiar with memory barriers and such, but is it at all possible that the actual problem is reordering of instructions in mem_cgroup_id_put_many(), such that we actually execute css_put() *before* mem_cgroup_id_remove()?
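For reference, the put side I'm describing looks roughly like this (paraphrased from mm/memcontrol.c, so double-check me against the actual source):

	static void mem_cgroup_id_put_many(struct mem_cgroup *memcg,
					   unsigned int n)
	{
		if (refcount_sub_and_test(n, &memcg->id.ref)) {
			/* Drop the ID mapping first ... */
			mem_cgroup_id_remove(memcg);

			/* ... then the css reference that the ID pins. */
			css_put(&memcg->css);
		}
	}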
If this reordering happens, the following interleaving seems possible:

cpu #1                                  cpu #2
css_put()
/* css_free_rwork_fn is queued */
                                        rcu_read_lock()
                                        mem_cgroup_from_id()
mem_cgroup_id_remove()
                                        /* access memcg */
If I understand correctly, if css_free_rwork_fn() is queued before the rcu_read_lock in workingset_refault() begins, then it can be executed during the rcu read section, and the memcg can be freed at any point from under us. Perhaps what we need is memory barriers to ensure correct ordering in mem_cgroup_id_put_many()? I am not sure if rcu_read_lock() implies a barrier on the other side.
Sorry if this is all off, I am just trying to understand what's going on.
On Fri, Aug 18, 2023 at 7:57 AM Yosry Ahmed <yosryahmed@google.com> wrote:
On Fri, Aug 18, 2023 at 6:49 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
On Thu, Aug 17, 2023 at 05:12:17PM -0600, Yu Zhao wrote:
On Thu, Aug 17, 2023 at 4:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
On Thu, Aug 17, 2023 at 3:43 PM Nhat Pham <nphamcs@gmail.com> wrote:
On Thu, Aug 17, 2023 at 1:50 PM Yosry Ahmed <yosryahmed@google.com> wrote:
Ah that is wild. That does sound plausible. In this case, maybe something like this?
	mem_cgroup_id_remove(memcg);

	/*
	 * Preventing css_put from happening before id removal due to
	 * instruction reordering.
	 *
	 * This guarantees that if a non-null memcg is acquired from ID within
	 * an RCU read section, its css won't be freed for the
	 * duration of this section.
	 */
	smp_mb();

	/* Memcg ID pins CSS */
	css_put(&memcg->css);
On Fri, Aug 18, 2023 at 9:24 AM Nhat Pham nphamcs@gmail.com wrote:
The comment here is mostly redefining what smp_mb() is; that's probably unnecessary.
I am not the best person to answer this question; ideally someone with more understanding of memory barriers should chime in here to:
- Confirm my theory is correct.
- Confirm smp_mb() is the correct primitive to use. I am guessing smp_wmb() is enough here.
- Confirm that we don't need an additional read barrier on the read side. Ideally rcu_read_lock() is enough, but I am not sure.
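For what it's worth, the pairing I have in mind is just the textbook one from Documentation/memory-barriers.txt (a generic sketch with made-up variables, not memcg code):

	/* writer */
	WRITE_ONCE(a, 1);
	smp_wmb();		/* order the store to a before the store to b */
	WRITE_ONCE(b, 1);

	/* reader */
	if (READ_ONCE(b)) {
		smp_rmb();	/* pairs with the writer's smp_wmb() */
		BUG_ON(READ_ONCE(a) != 1);	/* guaranteed to observe a == 1 */
	}

The question is whether mem_cgroup_id_remove() vs css_put() needs the same kind of pairing, or whether RCU already provides it.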
On Fri, Aug 18, 2023 at 07:56:37AM -0700, Yosry Ahmed wrote:
Hm. I think you're right that something is still missing from the analysis.
I don't quite see how that'd be possible. IDR uses rcu_assign_pointer() during deletion, which inserts the necessary barriering. My understanding is that this should always be safe:
rcu_read_lock()                 (writer serialization, in this case ref count == 0)
foo = idr_find(x)               idr_remove(x)
if (foo)                        kfree_rcu(foo)
        LOAD(foo->bar)
rcu_read_unlock()
On Fri, Aug 18, 2023 at 10:35 AM Johannes Weiner hannes@cmpxchg.org wrote:
How does a barrier inside IDR removal protect against the memcg being freed here though?
If css_put() is executed out-of-order before mem_cgroup_id_remove(), the memcg can be freed even before mem_cgroup_id_remove() is called, right?
On Fri, Aug 18, 2023 at 10:45:56AM -0700, Yosry Ahmed wrote:
css_put() can start earlier, but the RCU callback that does the freeing is not allowed to be reordered ahead of the rcu_assign_pointer() in idr_remove().
This is what RCU and its access primitives guarantee. They ensure that the memory is freed only after the pointer has been "unpublished" and all concurrent RCU-protected accesses to the object have finished.
On Fri, Aug 18, 2023 at 11:35 AM Johannes Weiner hannes@cmpxchg.org wrote:
I am not sure I understand; this is the scenario I mean:
cpu#1                   cpu#2                   cpu#3
css_put()
/* schedule free */
                        rcu_read_lock()
                                                idr_remove()
                        mem_cgroup_from_id()
/* free memcg */
                        /* use memcg */
If I understand correctly, you are saying that the scheduled free callback cannot run before idr_remove() due to the barrier in there, but that it can run after the rcu_read_lock() in cpu #2 because it was scheduled before that RCU critical section started, right?
On Fri, Aug 18, 2023 at 11:44:45AM -0700, Yosry Ahmed wrote:
Isn't there a simpler explanation? The memcg whose id is stored in the shadow entry has been freed, and there is an ongoing new memcg allocation which by chance has acquired the same id but has not yet been completely initialized. More specifically, the new memcg creation is between css_alloc() and init_and_link_css(), and there is a refault for the shadow entry holding that id.
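If that's the case, the window would be roughly the following (a sketch of the creation-side ordering; I'm assuming the idr_replace() that publishes the memcg currently runs at the end of css_alloc(), before init_and_link_css()):

	/* cgroup creation */
	css_alloc()
		memcg = mem_cgroup_alloc();	/* may reuse a recycled id */
		idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);	/* published */
	/* <-- window: findable via mem_cgroup_from_id(), css not initialized */
	init_and_link_css(&memcg->css, ss, cgrp);	/* ->parent etc. set up here */

	/* concurrent refault on another cpu */
	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);	/* finds the half-initialized memcg */
	/* the hierarchy walk then dereferences uninitialized css fields */
	rcu_read_unlock();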
On Fri, Aug 18, 2023 at 3:35 PM Shakeel Butt shakeelb@google.com wrote:
I think so, and this fix would just crash at tryget() instead when hitting the problem.
On Fri, Aug 18, 2023 at 2:52 PM Yu Zhao yuzhao@google.com wrote:
That's actually very plausible.
It seems like mem_cgroup_from_id() completely disregards memcg->id.ref. In the case that Shakeel mentioned, memcg->id.ref should have a count of 0. Perhaps we should update it to respect the id refcount? Maybe try to acquire a ref first before looking up the idr?
On Fri, Aug 18, 2023 at 09:35:02PM +0000, Shakeel Butt wrote:
Good catch.
It's late on a Friday, but I'm thinking we can just move that idr_replace() from css_alloc to first thing in css_online(). Make sure the css is fully initialized before it's published.
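Roughly like this sketch (assuming the publishing idr_replace() does indeed sit at the end of mem_cgroup_css_alloc() right now):

	static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
	{
		struct mem_cgroup *memcg = mem_cgroup_from_css(css);

		/*
		 * Publish the memcg in the idr only once the css is fully
		 * initialized, so that a concurrent mem_cgroup_from_id()
		 * can never return a half-constructed memcg.
		 */
		idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);

		return 0;
	}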
On Fri, Aug 18, 2023 at 11:44:45AM -0700, Yosry Ahmed wrote:
idr_remove() cannot be reordered to after the scheduling of the free. Think about it: this is the common RCU-freeing pattern:
	rcu_assign_pointer(p, NULL);
	call_rcu(rh, free_pointee);
on the write side, and:
	rcu_read_lock();
	pointee = rcu_dereference(p);
	if (pointee)
		do_stuff(pointee);
	rcu_read_unlock();
on the read side.
In our case, the rcu_assign_pointer() is in idr_remove(). And the rcu_dereference() is in mem_cgroup_from_id() -> idr_find() -> radix_tree_lookup() -> radix_tree_descend().
So if we find the memcg in the idr under rcu lock, the cgroup rcu work is guaranteed to not run until the lock is dropped. If we don't find it, it may or may not have already run.
On Fri, Aug 18, 2023 at 3:19 PM Johannes Weiner hannes@cmpxchg.org wrote:
Yeah I missed the implicit barrier there, thanks for bearing with me. I think Shakeel might have found the actual problem here (see his response).
On Thu, Aug 17, 2023 at 3:50 PM Yosry Ahmed yosryahmed@google.com wrote:
On Thu, Aug 17, 2023 at 3:43 PM Nhat Pham nphamcs@gmail.com wrote:
On Thu, Aug 17, 2023 at 1:50 PM Yosry Ahmed yosryahmed@google.com wrote:
On Thu, Aug 17, 2023 at 12:01 PM Nhat Pham nphamcs@gmail.com wrote:
In eviction recency check, we are currently not holding a local reference to the memcg that the refaulted folio belonged to when it was evicted. This could cause serious memcg lifetime issues, for e.g in the memcg hierarchy traversal done in mem_cgroup_get_nr_swap_pages(). This has occurred in production:
[... oops trace snipped ...]
This patch fixes the issue by getting a local reference inside unpack_shadow().
Fixes: f78dfc7b77d5 ("workingset: fix confusion around eviction vs refault container")
Beyond mem_cgroup_get_nr_swap_pages(), we still use the eviction_memcg without grabbing a ref to it first in workingset_test_recent() (and in workingset_refault() before that) as well as lru_gen_test_recent().
Wouldn't the fix go back even further? Or am I misinterpreting the problem?
Hmm, I don't see eviction_memcg being used outside of *_test_recent() (the rest just uses memcg = folio_memcg(folio), which if I'm not mistaken is the memcg that is refaulting the folio into memory).
Inside workingset_test_recent(), the only other place where eviction_memcg is used is the mem_cgroup_lruvec() call. That call won't crash whether eviction_memcg is valid or not.
If eviction_memcg is invalid because the memory was already freed, we are basically dereferencing garbage in mem_cgroup_lruvec() aren't we?
Ah, I see what you mean. Also, I think "valid" is too loaded a word here. I guess I meant that the refcnt (of the memcg's css) is 0, but I don't know a better term for that state than "valid" anyway.
Let me try to make it clear(-er):
When eviction_memcg's refcnt goes to 0, the memory allocated to eviction_memcg might still be around. This is because kfree(memcg) is only triggered when mem_cgroup_css_free(memcg) happens, and I believe that freeing is ultimately done from a deferred workqueue (i.e. asynchronously)?
(This is based on the documentation in kernel/cgroup/cgroup.c, above the css_free_rwork_fn() definition. That work function is where the css freeing is actually done.)
As long as the memory is still around and not garbage, all the deref happening in mem_cgroup_lruvec() should still give us sane values.
But yeah, it seems like you're right that getting the lruvec from eviction_memcg might fail as well! We've only observed the mem_cgroup_get_nr_swap_pages() failure (which is itself relatively rare), but it seems we could potentially run into this other bug too. Maybe we just haven't run 6.4 long enough to see it yet.
But either way, I still believe we absolutely need to hold a reference here to preserve the "validity" of eviction_memcg (whether for grabbing the lruvec or for the hierarchy traversal). It'll prevent css_free from happening, and if you look at css_free_rwork_fn(), you'll see that it also keeps the parent css from being put as well.
I hope I'm still making sense... This is a bit convoluted, I admit - please fact check me :)
The crash only happens during mem_cgroup_get_nr_swap_pages(), which does an upward traversal from eviction_memcg to root.
Let me know if this does not make sense and/or is insufficient to ensure safe upward traversal from eviction_memcg to root!
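To spell out the resulting contract on the caller side, a rough sketch follows; do_recency_math() is a hypothetical stand-in for the lruvec math and the mem_cgroup_get_nr_swap_pages() walk, while unpack_shadow() has the signature introduced by the patch below:

	static bool test_recent_sketch(void *shadow)
	{
		struct mem_cgroup *eviction_memcg;
		pg_data_t *pgdat;
		unsigned long eviction;
		bool workingset, recent;

		unpack_shadow(shadow, &eviction_memcg, &pgdat, &eviction, &workingset);
		if (!mem_cgroup_disabled() && !eviction_memcg)
			return false;

		/*
		 * eviction_memcg is pinned here, so the lruvec lookup and the
		 * upward walk in mem_cgroup_get_nr_swap_pages() are both safe.
		 */
		recent = do_recency_math(eviction_memcg, pgdat, eviction);

		mem_cgroup_put(eviction_memcg);	/* pairs with the css_tryget() */
		return recent;
	}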
Signed-off-by: Nhat Pham nphamcs@gmail.com
Cc: stable@vger.kernel.org
---
 mm/workingset.c | 65 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/mm/workingset.c b/mm/workingset.c
index da58a26d0d4d..03cadad4e484 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -206,10 +206,19 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
 	return xa_mk_value(eviction);
 }
 
-static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
-			  unsigned long *evictionp, bool *workingsetp)
+/*
+ * Unpacks the stored fields of a shadow entry into the given pointers.
+ *
+ * The memcg pointer is only populated if the memcg recorded in the shadow
+ * entry is valid. In this case, a reference to the memcg will be acquired,
+ * and a corresponding mem_cgroup_put() will be needed when we no longer
+ * need the memcg.
+ */
+static void unpack_shadow(void *shadow, struct mem_cgroup **memcgp,
+		pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp)
 {
 	unsigned long entry = xa_to_value(shadow);
+	struct mem_cgroup *memcg;
 	int memcgid, nid;
 	bool workingset;
 
@@ -220,7 +229,24 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
 	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
 	entry >>= MEM_CGROUP_ID_SHIFT;
 
-	*memcgidp = memcgid;
+	/*
+	 * Look up the memcg associated with the stored ID. It might
+	 * have been deleted since the folio's eviction.
+	 *
+	 * Note that in rare events the ID could have been recycled
+	 * for a new cgroup that refaults a shared folio. This is
+	 * impossible to tell from the available data. However, this
+	 * should be a rare and limited disturbance, and activations
+	 * are always speculative anyway. Ultimately, it's the aging
+	 * algorithm's job to shake out the minimum access frequency
+	 * for the active cache.
+	 */
+	memcg = mem_cgroup_from_id(memcgid);
+	if (memcg && css_tryget(&memcg->css))
+		*memcgp = memcg;
+	else
+		*memcgp = NULL;
+
 	*pgdat = NODE_DATA(nid);
 	*evictionp = entry;
 	*workingsetp = workingset;
@@ -262,15 +288,16 @@ static void *lru_gen_eviction(struct folio *folio)
 static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
 				unsigned long *token, bool *workingset)
 {
-	int memcg_id;
 	unsigned long min_seq;
 	struct mem_cgroup *memcg;
 	struct pglist_data *pgdat;
 
-	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
+	unpack_shadow(shadow, &memcg, &pgdat, token, workingset);
+	if (!mem_cgroup_disabled() && !memcg)
+		return false;
+Yu Zhao
There is a change of behavior here, right?
The existing code will continue if !mem_cgroup_disabled() && !memcg is true, and mem_cgroup_lruvec() will return the lruvec of the root memcg. Now we are just returning false.
Is this intentional?
Oh right, there is. Should have cc-ed Yu Zhao as well, my bad. get_maintainers.pl isn't always sufficient I guess :)
But yeah, this behavioral change is intentional.
Correct me if I'm wrong of course, but it seems like MGLRU should follow the same pattern here. That is, once we return from unpack_shadow, the possible scenarios are the same as prescribed in workingset_test_recent:
- If mem_cgroup is disabled, we can ignore this check.
- If mem_cgroup is enabled, then the only reason why we get a NULL memcg
  from unpack_shadow() is that the eviction_memcg is no longer valid. We
  should not try to get its lruvec, or substitute it with the root memcg,
  but return false right away (i.e. not recent). A condensed before/after
  sketch follows below.
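For reference, the behavioral delta boils down to something like this sketch (not the literal diff; the NULL-to-root fallback is existing mem_cgroup_lruvec() behavior):

	/* before: a stale ID quietly resolved to the root lruvec */
	memcg = mem_cgroup_from_id(memcg_id);		/* may be NULL */
	*lruvec = mem_cgroup_lruvec(memcg, pgdat);	/* NULL memcg -> root */

	/* after: a dead memcg means "not recent", full stop */
	if (!mem_cgroup_disabled() && !memcg)
		return false;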
I will leave this for Yu :)
Hi Nhat,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.5-rc6 next-20230817]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nhat-Pham/workingset-ensure-m...
base:   linus/master
patch link:    https://lore.kernel.org/r/20230817190126.3155299-1-nphamcs%40gmail.com
patch subject: [PATCH v2] workingset: ensure memcg is valid for recency check
config: loongarch-allnoconfig (https://download.01.org/0day-ci/archive/20230818/202308181116.FsJVPmJZ-lkp@i...)
compiler: loongarch64-linux-gcc (GCC) 12.3.0
reproduce: (https://download.01.org/0day-ci/archive/20230818/202308181116.FsJVPmJZ-lkp@i...)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot lkp@intel.com
| Closes: https://lore.kernel.org/oe-kbuild-all/202308181116.FsJVPmJZ-lkp@intel.com/
All errors (new ones prefixed by >>):
   mm/workingset.c: In function 'unpack_shadow':
>> mm/workingset.c:245:22: error: implicit declaration of function 'css_tryget'; did you mean 'wb_tryget'? [-Werror=implicit-function-declaration]
     245 |         if (memcg && css_tryget(&memcg->css))
         |                      ^~~~~~~~~~
         |                      wb_tryget
>> mm/workingset.c:245:39: error: invalid use of undefined type 'struct mem_cgroup'
     245 |         if (memcg && css_tryget(&memcg->css))
         |                                       ^~
   cc1: some warnings being treated as errors
vim +245 mm/workingset.c

   208	
   209	/*
   210	 * Unpacks the stored fields of a shadow entry into the given pointers.
   211	 *
   212	 * The memcg pointer is only populated if the memcg recorded in the shadow
   213	 * entry is valid. In this case, a reference to the memcg will be acquired,
   214	 * and a corresponding mem_cgroup_put() will be needed when we no longer
   215	 * need the memcg.
   216	 */
   217	static void unpack_shadow(void *shadow, struct mem_cgroup **memcgp,
   218			pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp)
   219	{
   220		unsigned long entry = xa_to_value(shadow);
   221		struct mem_cgroup *memcg;
   222		int memcgid, nid;
   223		bool workingset;
   224	
   225		workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
   226		entry >>= WORKINGSET_SHIFT;
   227		nid = entry & ((1UL << NODES_SHIFT) - 1);
   228		entry >>= NODES_SHIFT;
   229		memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
   230		entry >>= MEM_CGROUP_ID_SHIFT;
   231	
   232		/*
   233		 * Look up the memcg associated with the stored ID. It might
   234		 * have been deleted since the folio's eviction.
   235		 *
   236		 * Note that in rare events the ID could have been recycled
   237		 * for a new cgroup that refaults a shared folio. This is
   238		 * impossible to tell from the available data. However, this
   239		 * should be a rare and limited disturbance, and activations
   240		 * are always speculative anyway. Ultimately, it's the aging
   241		 * algorithm's job to shake out the minimum access frequency
   242		 * for the active cache.
   243		 */
   244		memcg = mem_cgroup_from_id(memcgid);
 > 245		if (memcg && css_tryget(&memcg->css))
   246			*memcgp = memcg;
   247		else
   248			*memcgp = NULL;
   249	
   250		*pgdat = NODE_DATA(nid);
   251		*evictionp = entry;
   252		*workingsetp = workingset;
   253	}
   254	
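Both reports come from configs without CONFIG_MEMCG (the first, loongarch-allnoconfig, presumably without CONFIG_CGROUPS at all), where struct mem_cgroup is an incomplete type and css_tryget() isn't declared. If I'm reading include/linux/memcontrol.h right, the mem_cgroup_tryget() helper has a stub for that configuration, so a sketch along these lines should build either way:

	memcg = mem_cgroup_from_id(memcgid);
	/* mem_cgroup_tryget() wraps css_tryget() and has a !CONFIG_MEMCG stub */
	if (memcg && mem_cgroup_tryget(memcg))
		*memcgp = memcg;
	else
		*memcgp = NULL;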
Hi Nhat,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.5-rc6 next-20230817]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nhat-Pham/workingset-ensure-m...
base:   linus/master
patch link:    https://lore.kernel.org/r/20230817190126.3155299-1-nphamcs%40gmail.com
patch subject: [PATCH v2] workingset: ensure memcg is valid for recency check
config: x86_64-buildonly-randconfig-r003-20230818 (https://download.01.org/0day-ci/archive/20230818/202308181130.VIAl2viu-lkp@i...)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce: (https://download.01.org/0day-ci/archive/20230818/202308181130.VIAl2viu-lkp@i...)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot lkp@intel.com
| Closes: https://lore.kernel.org/oe-kbuild-all/202308181130.VIAl2viu-lkp@intel.com/
All errors (new ones prefixed by >>):
   mm/workingset.c: In function 'unpack_shadow':
>> mm/workingset.c:245:32: error: dereferencing pointer to incomplete type 'struct mem_cgroup'
     245 |         if (memcg && css_tryget(&memcg->css))
         |                                ^~
vim +245 mm/workingset.c

   208	
   209	/*
   210	 * Unpacks the stored fields of a shadow entry into the given pointers.
   211	 *
   212	 * The memcg pointer is only populated if the memcg recorded in the shadow
   213	 * entry is valid. In this case, a reference to the memcg will be acquired,
   214	 * and a corresponding mem_cgroup_put() will be needed when we no longer
   215	 * need the memcg.
   216	 */
   217	static void unpack_shadow(void *shadow, struct mem_cgroup **memcgp,
   218			pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp)
   219	{
   220		unsigned long entry = xa_to_value(shadow);
   221		struct mem_cgroup *memcg;
   222		int memcgid, nid;
   223		bool workingset;
   224	
   225		workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
   226		entry >>= WORKINGSET_SHIFT;
   227		nid = entry & ((1UL << NODES_SHIFT) - 1);
   228		entry >>= NODES_SHIFT;
   229		memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
   230		entry >>= MEM_CGROUP_ID_SHIFT;
   231	
   232		/*
   233		 * Look up the memcg associated with the stored ID. It might
   234		 * have been deleted since the folio's eviction.
   235		 *
   236		 * Note that in rare events the ID could have been recycled
   237		 * for a new cgroup that refaults a shared folio. This is
   238		 * impossible to tell from the available data. However, this
   239		 * should be a rare and limited disturbance, and activations
   240		 * are always speculative anyway. Ultimately, it's the aging
   241		 * algorithm's job to shake out the minimum access frequency
   242		 * for the active cache.
   243		 */
   244		memcg = mem_cgroup_from_id(memcgid);
 > 245		if (memcg && css_tryget(&memcg->css))
   246			*memcgp = memcg;
   247		else
   248			*memcgp = NULL;
   249	
   250		*pgdat = NODE_DATA(nid);
   251		*evictionp = entry;
   252		*workingsetp = workingset;
   253	}
   254	
On Thu, Aug 17, 2023 at 12:01:26PM -0700, Nhat Pham wrote:
 static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
 				unsigned long *token, bool *workingset)
 {
-	int memcg_id;
 	unsigned long min_seq;
 	struct mem_cgroup *memcg;
 	struct pglist_data *pgdat;
 
-	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
+	unpack_shadow(shadow, &memcg, &pgdat, token, workingset);
+	if (!mem_cgroup_disabled() && !memcg)
+		return false;
 
-	memcg = mem_cgroup_from_id(memcg_id);
 	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	mem_cgroup_put(memcg);
 
 	min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
 	return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
This isn't quite right. lruvec's lifetime is bound to the memcg, so the put has to happen after the code is done accessing it.
lru_gen_refault() is still using the eviction lruvec after the recency test - but only if eviction_lruvec == refault_lruvec. This gives me pause, because they're frequently different. This is a common setup:
root - slice - service 1
            `- service 2
where slice has the limit and will be the cause of evictions, whereas the service groups have the actual pages and see the refaults.
workingset_eviction() and workingset_refault() will do recency evaluation against slice, and refault accounting against service X.
MGLRU will use service X to determine recency, which 1) will not properly activate when one service is thrashing while the other is dominating the memory allowance of slice, and 2) will not detect refaults of pages shared between the services.
Maybe it's time to fix this first and bring the two codebases into unison. Then the recency test and the eviction group's lifetime can be encapsulated in test_recent(), and _refault() can just use folio_memcg() for the activation and accounting.
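A rough sketch of what that consolidation could look like, under those assumptions; do_refault_accounting() is a hypothetical stand-in for the activation/accounting path, while workingset_test_recent(), folio_is_file_lru(), and folio_memcg() exist today:

	static void workingset_refault_sketch(struct folio *folio, void *shadow)
	{
		bool workingset;

		/*
		 * Recency is judged against the eviction hierarchy; the
		 * eviction memcg's lifetime is fully contained in here.
		 */
		if (!workingset_test_recent(shadow, folio_is_file_lru(folio),
					    &workingset))
			return;

		/* Activation and accounting target the refaulting memcg. */
		do_refault_accounting(folio_memcg(folio), folio, workingset);
	}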