[..]
+/*********************************
+* lru functions
+**********************************/
+static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+	struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
Could we avoid the need for get/put with an rcu_read_lock() instead?
I think we can; I'm not entirely sure of the consequences, though. By the look of it I'd say it's safe, but I wouldn't trust my judgement on this.
It just seems like we have a pattern of short-lived get/put. If RCU gives enough protection it should be simpler. IIUC taking a reference does not protect against offlining or reparenting, so I am not sure if taking a reference here would provide any more protection than RCU would.
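For illustration, the RCU variant would look roughly like this. This is a sketch only: it assumes the series' entry_to_nid() helper, a hypothetical non-refcounting mem_cgroup_from_entry(), and that __list_lru_add() does not touch the memcg after returning, none of which is verified here:

	static bool zswap_lru_add(struct list_lru *list_lru,
				  struct zswap_entry *entry)
	{
		bool added;

		/* Pin the memcg only for the duration of the list op. */
		rcu_read_lock();
		added = __list_lru_add(list_lru, &entry->lru,
				       entry_to_nid(entry),
				       mem_cgroup_from_entry(entry));
		rcu_read_unlock();
		return added;
	}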
[..]
@@ -686,7 +716,36 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
 	zswap_entry_put(tree, entry);
 unlock:
 	spin_unlock(&tree->lock);
-	return ret ? -EAGAIN : 0;
+	spin_lock(lock);
+	return ret;
+}
+static int shrink_memcg(struct mem_cgroup *memcg)
+{
+	struct zswap_pool *pool;
+	int nid, shrunk = 0;
+
+	pool = zswap_pool_current_get();
+	if (!pool)
+		return -EINVAL;
+
+	/*
+	 * Skip zombies because their LRUs are reparented and we would be
+	 * reclaiming from the parent instead of the dead memcgroup.
nit: s/memcgroup/memcg.
+	 */
+	if (memcg && !mem_cgroup_online(memcg))
+		goto out;
If we move this check above zswap_pool_current_get(), we can return directly and remove the label. I also noticed we will return -EAGAIN if the memcg is offline. IIUC -EAGAIN makes the caller move on to the next memcg, but I am wondering if a different errno would be clearer here.
True, I remember spending some time staring at error codes but couldn't find a better one. What if we use -EINVAL for retryable errors, and use something else for the one where there is no pool? -ENODEV?
Do you mean -EINVAL for non-retryable errors? Perhaps -ENOENT is more appropriate as a return for offline memcgs?
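Assuming that's what we converge on (offline memcg -> -ENOENT, no pool -> -ENODEV, nothing reclaimed -> -EAGAIN as today), the entry points of shrink_memcg() would become something like this. Sketch only; the per-node LRU walk is elided, and the -EAGAIN tail is inferred from the retry handling in shrink_worker() below:

	static int shrink_memcg(struct mem_cgroup *memcg)
	{
		struct zswap_pool *pool;
		int nid, shrunk = 0;

		/*
		 * Skip zombies: their LRUs are reparented and we would be
		 * reclaiming from the parent instead of the dead memcg.
		 */
		if (memcg && !mem_cgroup_online(memcg))
			return -ENOENT;

		pool = zswap_pool_current_get();
		if (!pool)
			return -ENODEV;

		/* ... walk each node nid's list_lru, bumping shrunk ... */

		zswap_pool_put(pool);
		return shrunk ? 0 : -EAGAIN;
	}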
[..]
@@ -695,10 +754,13 @@ static void shrink_worker(struct work_struct *w)
 						shrink_work);
 	int ret, failures = 0;
 
+	/* global reclaim will select cgroup in a round-robin fashion. */
 	do {
-		ret = zswap_reclaim_entry(pool);
+		pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
Perhaps next_shrink_memcg is a better name here?
Will change if you have a strong preference, but I'd keep it shorter because it's always used in conjunction with a memcg type or function.
I'd rather have the more explicit name, unless it causes some annoying line breaks or something.
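For reference, with the rename the iteration in the worker would read something like (illustrative only):

	do {
		pool->next_shrink_memcg = mem_cgroup_iter(NULL,
					pool->next_shrink_memcg, NULL);

		ret = shrink_memcg(pool->next_shrink_memcg);

so the mem_cgroup_iter() call does need one extra line break to stay within 80 columns, but nothing worse than that.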
+
+		ret = shrink_memcg(pool->next_shrink);
 		if (ret) {
 			zswap_reject_reclaim_fail++;
 			if (ret != -EAGAIN)
 				break;
 			if (++failures == MAX_RECLAIM_RETRIES)
@@ -764,8 +826,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	 */
 	kref_init(&pool->kref);
 	INIT_LIST_HEAD(&pool->list);
-	INIT_LIST_HEAD(&pool->lru);
-	spin_lock_init(&pool->lru_lock);
+	list_lru_init_memcg(&pool->list_lru, NULL);
 	INIT_WORK(&pool->shrink_work, shrink_worker);
 	zswap_pool_debug("created", pool);
@@ -831,6 +892,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 	free_percpu(pool->acomp_ctx);
+	list_lru_destroy(&pool->list_lru);
+	if (pool->next_shrink)
+		mem_cgroup_put(pool->next_shrink);
 
 	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
 		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
@@ -1076,7 +1140,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	/* try to allocate swap cache page */
 	page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
-				       &page_was_allocated);
+				       &page_was_allocated, true);
 	if (!page) {
 		ret = -ENOMEM;
 		goto fail;
@@ -1142,7 +1206,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	/* start writeback */
 	__swap_writepage(page, &wbc);
 	put_page(page);
-	zswap_written_back_pages++;
 
 	return ret;
@@ -1199,8 +1262,10 @@ bool zswap_store(struct folio *folio)
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
 	struct obj_cgroup *objcg = NULL;
+	struct mem_cgroup *memcg = NULL;
 	struct zswap_pool *pool;
 	struct zpool *zpool;
+	int lru_alloc_ret;
 	unsigned int dlen = PAGE_SIZE;
 	unsigned long handle, value;
 	char *buf;
@@ -1230,15 +1295,15 @@ bool zswap_store(struct folio *folio)
 		zswap_invalidate_entry(tree, dupentry);
 	}
 	spin_unlock(&tree->lock);
-
-	/*
-	 * XXX: zswap reclaim does not work with cgroups yet. Without a
-	 * cgroup-aware entry LRU, we will push out entries system-wide based on
-	 * local cgroup limits.
-	 */
 	objcg = get_obj_cgroup_from_folio(folio);
-	if (objcg && !obj_cgroup_may_zswap(objcg))
-		goto reject;
+	if (objcg && !obj_cgroup_may_zswap(objcg)) {
+		memcg = get_mem_cgroup_from_objcg(objcg);
+		if (shrink_memcg(memcg)) {
+			mem_cgroup_put(memcg);
+			goto reject;
+		}
+		mem_cgroup_put(memcg);
+	}
 
 	/* reclaim space if needed */
 	if (zswap_is_full()) {
@@ -1254,10 +1319,15 @@ bool zswap_store(struct folio *folio)
 		zswap_pool_reached_full = false;
 	}
 
+	pool = zswap_pool_current_get();
+	if (!pool)
+		goto reject;
+
Why do we need to move zswap_pool_current_get() up here?
Ah, thanks. This is a leftover from a previous version where the pool was needed to allocate the entry.
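With the leftover dropped, the idea would be to go back to taking the reference only where the entry adopts it, roughly (untested sketch of the intended order, not the actual respin):

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		goto reject;
	}

	...

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool)
		goto freepage;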
 	/* allocate entry */
-	entry = zswap_entry_cache_alloc(GFP_KERNEL);
+	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
 	if (!entry) {
 		zswap_reject_kmemcache_fail++;
+		zswap_pool_put(pool);
 		goto reject;
 	}
@@ -1269,6 +1339,7 @@ bool zswap_store(struct folio *folio)
 		entry->length = 0;
 		entry->value = value;
 		atomic_inc(&zswap_same_filled_pages);
+		zswap_pool_put(pool);
 		goto insert_entry;
 	}
 	kunmap_atomic(src);
@@ -1278,9 +1349,15 @@ bool zswap_store(struct folio *folio)
 		goto freepage;
 
 	/* if entry is successfully added, it keeps the reference */
-	entry->pool = zswap_pool_current_get();
-	if (!entry->pool)
-		goto freepage;
+	entry->pool = pool;
+
+	if (objcg) {
+		memcg = get_mem_cgroup_from_objcg(objcg);
+		lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
+		mem_cgroup_put(memcg);
+		if (lru_alloc_ret)
+			goto freepage;
+	}
 
 	/* compress */
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -1358,9 +1435,8 @@ bool zswap_store(struct folio *folio)
 		zswap_invalidate_entry(tree, dupentry);
 	}
 	if (entry->length) {
-		spin_lock(&entry->pool->lru_lock);
-		list_add(&entry->lru, &entry->pool->lru);
-		spin_unlock(&entry->pool->lru_lock);
+		INIT_LIST_HEAD(&entry->lru);
+		zswap_lru_add(&pool->list_lru, entry);
 	}
 	spin_unlock(&tree->lock);
@@ -1373,8 +1449,8 @@ bool zswap_store(struct folio *folio)
 put_dstmem:
 	mutex_unlock(acomp_ctx->mutex);
-	zswap_pool_put(entry->pool);
 freepage:
+	zswap_pool_put(entry->pool);
 	zswap_entry_cache_free(entry);
 reject:
 	if (objcg)

@@ -1467,9 +1543,8 @@ bool zswap_load(struct folio *folio)
 		zswap_invalidate_entry(tree, entry);
 		folio_mark_dirty(folio);
 	} else if (entry->length) {
-		spin_lock(&entry->pool->lru_lock);
-		list_move(&entry->lru, &entry->pool->lru);
-		spin_unlock(&entry->pool->lru_lock);
+		zswap_lru_del(&entry->pool->list_lru, entry);
+		zswap_lru_add(&entry->pool->list_lru, entry);
 	}
 	zswap_entry_put(tree, entry);
 	spin_unlock(&tree->lock);
-- 
2.34.1