[..]
+/*********************************
+* lru functions
+**********************************/
+static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+	struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
Could we avoid the need for get/put with an rcu_read_lock() instead?
I think we can; I'm not entirely sure of the consequences, though. By the look of it I'd say it's safe, but I wouldn't trust my judgement on this.
It just seems like we have a pattern of short-lived get/put. If RCU gives enough protection it should be simpler. IIUC taking a reference does not protect against offlining or reparenting, so I am not sure if taking a reference here would provide any more protection than RCU would.
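For illustration, the RCU variant would look roughly like this. This is a sketch only: it assumes the series' entry_to_nid() helper, a hypothetical non-refcounting mem_cgroup_from_entry(), and that __list_lru_add() does not touch the memcg after returning, none of which is verified here:

	static bool zswap_lru_add(struct list_lru *list_lru,
				  struct zswap_entry *entry)
	{
		bool added;

		/* Pin the memcg only for the duration of the list op. */
		rcu_read_lock();
		added = __list_lru_add(list_lru, &entry->lru,
				       entry_to_nid(entry),
				       mem_cgroup_from_entry(entry));
		rcu_read_unlock();
		return added;
	}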
[..]
@@ -686,7 +716,36 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
 	zswap_entry_put(tree, entry);
 unlock:
 	spin_unlock(&tree->lock);
-	return ret ? -EAGAIN : 0;
+	spin_lock(lock);
+	return ret;
+}
+static int shrink_memcg(struct mem_cgroup *memcg)
+{
+	struct zswap_pool *pool;
+	int nid, shrunk = 0;
+
+	pool = zswap_pool_current_get();
+	if (!pool)
+		return -EINVAL;
+
+	/*
+	 * Skip zombies because their LRUs are reparented and we would be
+	 * reclaiming from the parent instead of the dead memcgroup.
nit: s/memcgroup/memcg.
+	 */
+	if (memcg && !mem_cgroup_online(memcg))
+		goto out;
If we move this check above zswap_pool_current_get(), we can return directly and remove the label. I also noticed we will return -EAGAIN if the memcg is offline. IIUC -EAGAIN makes the caller move on to the next memcg, but I am wondering if a different errno would be clearer here.
True, I remember spending some time staring at error codes but couldn't find a better one. What if we use -EINVAL for retryable errors, and use something else for the one where there is no pool? -ENODEV?
Do you mean -EINVAL for non-retryable errors? Perhaps -ENOENT is more appropriate as a return for offline memcgs?
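Assuming that's what we converge on (offline memcg -> -ENOENT, no pool -> -ENODEV, nothing reclaimed -> -EAGAIN as today), the entry points of shrink_memcg() would become something like this. Sketch only; the per-node LRU walk is elided, and the -EAGAIN tail is inferred from the retry handling in shrink_worker() below:

	static int shrink_memcg(struct mem_cgroup *memcg)
	{
		struct zswap_pool *pool;
		int nid, shrunk = 0;

		/*
		 * Skip zombies: their LRUs are reparented and we would be
		 * reclaiming from the parent instead of the dead memcg.
		 */
		if (memcg && !mem_cgroup_online(memcg))
			return -ENOENT;

		pool = zswap_pool_current_get();
		if (!pool)
			return -ENODEV;

		/* ... walk each node nid's list_lru, bumping shrunk ... */

		zswap_pool_put(pool);
		return shrunk ? 0 : -EAGAIN;
	}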
[..]
@@ -695,10 +754,13 @@ static void shrink_worker(struct work_struct *w)
 						shrink_work);
 	int ret, failures = 0;
 
+	/* global reclaim will select cgroup in a round-robin fashion. */
 	do {
-		ret = zswap_reclaim_entry(pool);
+		pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
Perhaps next_shrink_memcg is a better name here?
Will change if you have a strong preference, but I'd keep it shorter because it's always used in conjunction with a memcg type or function.
I'd rather have the more explicit name, unless it causes some annoying line breaks or something.
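For reference, with the rename the iteration in the worker would read something like (illustrative only):

	do {
		pool->next_shrink_memcg = mem_cgroup_iter(NULL,
					pool->next_shrink_memcg, NULL);

		ret = shrink_memcg(pool->next_shrink_memcg);

so the mem_cgroup_iter() call does need one extra line break to stay within 80 columns, but nothing worse than that.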
+
+		ret = shrink_memcg(pool->next_shrink);
 		if (ret) {
 			zswap_reject_reclaim_fail++;
 			if (ret != -EAGAIN)
 				break;
 			if (++failures == MAX_RECLAIM_RETRIES)
@@ -764,8 +826,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	 */
 	kref_init(&pool->kref);
 	INIT_LIST_HEAD(&pool->list);
-	INIT_LIST_HEAD(&pool->lru);
-	spin_lock_init(&pool->lru_lock);
+	list_lru_init_memcg(&pool->list_lru, NULL);
 	INIT_WORK(&pool->shrink_work, shrink_worker);
 	zswap_pool_debug("created", pool);
@@ -831,6 +892,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 	free_percpu(pool->acomp_ctx);
+	list_lru_destroy(&pool->list_lru);
+	if (pool->next_shrink)
+		mem_cgroup_put(pool->next_shrink);
 
 	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
 		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
@@ -1076,7 +1140,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	/* try to allocate swap cache page */
 	page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
-				       &page_was_allocated);
+				       &page_was_allocated, true);
 	if (!page) {
 		ret = -ENOMEM;
 		goto fail;
@@ -1142,7 +1206,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	/* start writeback */
 	__swap_writepage(page, &wbc);
 	put_page(page);
-	zswap_written_back_pages++;
 
 	return ret;
@@ -1199,8 +1262,10 @@ bool zswap_store(struct folio *folio)
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
 	struct obj_cgroup *objcg = NULL;
+	struct mem_cgroup *memcg = NULL;
 	struct zswap_pool *pool;
 	struct zpool *zpool;
+	int lru_alloc_ret;
 	unsigned int dlen = PAGE_SIZE;
 	unsigned long handle, value;
 	char *buf;
@@ -1230,15 +1295,15 @@ bool zswap_store(struct folio *folio)
 		zswap_invalidate_entry(tree, dupentry);
 	}
 	spin_unlock(&tree->lock);
-
-	/*
-	 * XXX: zswap reclaim does not work with cgroups yet. Without a
-	 * cgroup-aware entry LRU, we will push out entries system-wide based on
-	 * local cgroup limits.
-	 */
 	objcg = get_obj_cgroup_from_folio(folio);
-	if (objcg && !obj_cgroup_may_zswap(objcg))
-		goto reject;
+	if (objcg && !obj_cgroup_may_zswap(objcg)) {
+		memcg = get_mem_cgroup_from_objcg(objcg);
+		if (shrink_memcg(memcg)) {
+			mem_cgroup_put(memcg);
+			goto reject;
+		}
+		mem_cgroup_put(memcg);
+	}
 
 	/* reclaim space if needed */
 	if (zswap_is_full()) {
@@ -1254,10 +1319,15 @@ bool zswap_store(struct folio *folio)
 		zswap_pool_reached_full = false;
 	}
 
+	pool = zswap_pool_current_get();
+	if (!pool)
+		goto reject;
+
Why do we need to move zswap_pool_current_get() up here?
Ah, thanks. This is a leftover from a previous version where the pool was needed to allocate the entry.
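With the leftover dropped, the idea would be to go back to taking the reference only where the entry adopts it, roughly (untested sketch of the intended order, not the actual respin):

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		goto reject;
	}

	...

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool)
		goto freepage;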
 	/* allocate entry */
-	entry = zswap_entry_cache_alloc(GFP_KERNEL);
+	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
 	if (!entry) {
 		zswap_reject_kmemcache_fail++;
+		zswap_pool_put(pool);
 		goto reject;
 	}
@@ -1269,6 +1339,7 @@ bool zswap_store(struct folio *folio)
 		entry->length = 0;
 		entry->value = value;
 		atomic_inc(&zswap_same_filled_pages);
+		zswap_pool_put(pool);
 		goto insert_entry;
 	}
 	kunmap_atomic(src);
@@ -1278,9 +1349,15 @@ bool zswap_store(struct folio *folio)
 		goto freepage;
 
 	/* if entry is successfully added, it keeps the reference */
-	entry->pool = zswap_pool_current_get();
-	if (!entry->pool)
-		goto freepage;
+	entry->pool = pool;
+
+	if (objcg) {
+		memcg = get_mem_cgroup_from_objcg(objcg);
+		lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
+		mem_cgroup_put(memcg);
+		if (lru_alloc_ret)
+			goto freepage;
+	}
 
 	/* compress */
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -1358,9 +1435,8 @@ bool zswap_store(struct folio *folio)
 		zswap_invalidate_entry(tree, dupentry);
 	}
 	if (entry->length) {
-		spin_lock(&entry->pool->lru_lock);
-		list_add(&entry->lru, &entry->pool->lru);
-		spin_unlock(&entry->pool->lru_lock);
+		INIT_LIST_HEAD(&entry->lru);
+		zswap_lru_add(&pool->list_lru, entry);
 	}
 	spin_unlock(&tree->lock);
@@ -1373,8 +1449,8 @@ bool zswap_store(struct folio *folio)
 put_dstmem:
 	mutex_unlock(acomp_ctx->mutex);
-	zswap_pool_put(entry->pool);
 freepage:
+	zswap_pool_put(entry->pool);
 	zswap_entry_cache_free(entry);
 reject:
 	if (objcg)

@@ -1467,9 +1543,8 @@ bool zswap_load(struct folio *folio)
 		zswap_invalidate_entry(tree, entry);
 		folio_mark_dirty(folio);
 	} else if (entry->length) {
-		spin_lock(&entry->pool->lru_lock);
-		list_move(&entry->lru, &entry->pool->lru);
-		spin_unlock(&entry->pool->lru_lock);
+		zswap_lru_del(&entry->pool->list_lru, entry);
+		zswap_lru_add(&entry->pool->list_lru, entry);
 	}
 	zswap_entry_put(tree, entry);
 	spin_unlock(&tree->lock);
-- 
2.34.1