Hi David,
Thanks for pursuing this! A couple of comments below.
On Mon, Jul 22, 2024 at 07:55:53PM -0400, David Finkel wrote:
@@ -1322,11 +1322,16 @@ PAGE_SIZE multiple when read back. reclaim induced by memory.reclaim. memory.peak
- A read-only single value file which exists on non-root
- cgroups.
- A read-write single value file which exists on non-root cgroups.
- The max memory usage recorded for the cgroup and its descendants since
- either the creation of the cgroup or the most recent reset for that FD.
- The max memory usage recorded for the cgroup and its
- descendants since the creation of the cgroup.
- A write of the string "reset" to this file resets it to the
- current memory usage for subsequent reads through the same
- file descriptor.
- Attempts to write any other non-empty string will return EINVAL
- (modulo leading and trailing whitespace).
Why not allow any write to reset? This makes it harder to use, and I'm not sure accidental writes are a likely mistake to make.
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2150ca60394b..7001ed74e339 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -12,6 +12,7 @@ #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/nodemask.h> +#include <linux/list.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/fs.h> @@ -855,4 +856,11 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); +struct memcg_peak_mem_ctx {
- long local_watermark;
- struct list_head peers;
+};
Since this is generic cgroup code, and can be conceivably used by other controllers, let's keep the naming generic as well. How about:
struct cgroup_of_peak { long value; struct list_head list; };
cgroup-defs.h would be a better place for it.
+struct memcg_peak_mem_ctx *memcg_extract_peak_mem_ctx(struct kernfs_open_file *of);
of_peak()
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 030d34e9d117..cbc390234605 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -198,6 +198,11 @@ struct mem_cgroup { struct page_counter kmem; /* v1 only */ struct page_counter tcpmem; /* v1 only */
- /* lists of memcg peak watching contexts on swap and memory */
- struct list_head peak_memory_local_watermark_watchers;
- struct list_head peak_swap_local_watermark_watchers;
- spinlock_t swap_memory_peak_watchers_lock;
These names are too long. How about:
/* Registered local usage peak watchers */ struct list_head memory_peaks; struct list_head swap_peaks; spinlock_t peaks_lock;
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 8cd858d912c4..06bb84218960 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -26,6 +26,7 @@ struct page_counter { atomic_long_t children_low_usage; unsigned long watermark;
- unsigned long local_watermark; /* track min of fd-local resets */ unsigned long failcnt;
/* Keep all the read most fields in a separete cacheline. */ @@ -78,7 +79,15 @@ int page_counter_memparse(const char *buf, const char *max, static inline void page_counter_reset_watermark(struct page_counter *counter) {
- counter->watermark = page_counter_read(counter);
- unsigned long cur = page_counter_read(counter);
cur -> usage
@@ -6907,12 +6912,109 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } -static u64 memory_peak_read(struct cgroup_subsys_state *css,
struct cftype *cft)
+inline int swap_memory_peak_show(
- struct seq_file *sf, void *v, bool swap_cg)
{
Leave inlining to the compiler. Just static int.
The name can be simply peak_show().
Customary coding style is to line wrap at the last parameter that fits. Don't wrap if the line fits within 80 cols.
static int peak_show(struct seq_file *sf, void *v, ..., ...) { ... }
- struct cgroup_subsys_state *css = seq_css(sf); struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct page_counter *pc;
- struct kernfs_open_file *of = sf->private;
- struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
- s64 fd_peak = ctx->local_watermark;
- return (u64)memcg->memory.watermark * PAGE_SIZE;
- if (swap_cg)
pc = &memcg->swap;
- else
pc = &memcg->memory;
- if (fd_peak == -1) {
seq_printf(sf, "%llu\n", (u64)pc->watermark * PAGE_SIZE);
return 0;
- }
- s64 pc_peak = pc->local_watermark;
- s64 wm = fd_peak > pc_peak ? fd_peak : pc_peak;
- seq_printf(sf, "%lld\n", wm * PAGE_SIZE);
- return 0;
+}
As per Roman's feedback, don't mix decls and code.
You can simplify it by extracting css and memcg in the callers, then pass the right struct page counter *pc directly.
That should eliminate most local variables as well.
static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc) { struct cgroup_of_peak *ofp = of_peak(sf->private); u64 peak;
/* User wants global or local peak? */ if (ofp->value == -1) peak = pc->watermark; else peak = max(ofp->value, pc->local_watermark);
seq_printf(sf, "%lld\n", peak * PAGE_SIZE); }
+static int memory_peak_show(struct seq_file *sf, void *v) +{
- return swap_memory_peak_show(sf, v, false);
And then do:
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
return peak_show(sf, v, &memcg->memory);
Then do the same with ... &memcg->swap.
+inline ssize_t swap_memory_peak_write(
- struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off, bool swap_cg)
+{
Same feedback as above. Please don't inline explicitly (unless it really is measurably a performance improvement in a critical path), and stick to surrounding coding style.
Here too, pass page_counter directly and save the branches.
- unsigned long cur;
- struct memcg_peak_mem_ctx *peer_ctx;
- struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
- struct page_counter *pc;
- struct list_head *watchers, *pos;
- buf = strstrip(buf);
- /* Only allow "reset" to keep the API clear */
- if (strcmp(buf, "reset"))
return -EINVAL;
- if (swap_cg) {
pc = &memcg->swap;
watchers = &memcg->peak_swap_local_watermark_watchers;
- } else {
pc = &memcg->memory;
watchers = &memcg->peak_memory_local_watermark_watchers;
- }
- spin_lock(&memcg->swap_memory_peak_watchers_lock);
- page_counter_reset_local_watermark(pc);
- cur = pc->local_watermark;
- list_for_each(pos, watchers) {
list_for_each_entry()
peer_ctx = list_entry(pos, typeof(*ctx), peers);
if (cur > peer_ctx->local_watermark)
peer_ctx->local_watermark = cur;
- }
I don't think this is quite right. local_peak could be higher than the current usage when a new watcher shows up. The other watchers should retain the higher local_peak, not the current usage.
- if (ctx->local_watermark == -1)
/* only append to the list if we're not already there */
list_add_tail(&ctx->peers, watchers);
- ctx->local_watermark = cur;
This makes me think that page_counter_reset_local_watermark() is not a good helper. It obscures what's going on. Try without it.
AFAICS the list ordering doesn't matter, so keep it simple and use a plain list_add().
/* * A new local peak is being tracked in pc->local_watermark. * Save current local peak in all watchers. */ list_for_each_entry(pos, ...) if (pc->local_watermark > pos->value) pos->value = pc->local_watermark;
pc->local_watermark = page_counter_read(pc);
/* Initial write, register watcher */ if (ofp->value == -1) list_add()
ofp->value = pc->local_watermark;
diff --git a/mm/page_counter.c b/mm/page_counter.c index db20d6452b71..724d31508664 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -79,9 +79,22 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) /* * This is indeed racy, but we can live with some * inaccuracy in the watermark.
*
* Notably, we have two watermarks to allow for both a globally
* visible peak and one that can be reset at a smaller scope.
*
* Since we reset both watermarks when the global reset occurs,
* we can guarantee that watermark >= local_watermark, so we
* don't need to do both comparisons every time.
*
* On systems with branch predictors, the inner condition should
* be almost free.
*/
if (new > READ_ONCE(c->watermark))
WRITE_ONCE(c->watermark, new);
if (new > READ_ONCE(c->local_watermark)) {
WRITE_ONCE(c->local_watermark, new);
if (new > READ_ONCE(c->watermark))
WRITE_ONCE(c->watermark, new);
}}
} @@ -131,10 +144,23 @@ bool page_counter_try_charge(struct page_counter *counter, propagate_protected_usage(c, new); /* * Just like with failcnt, we can live with some
* inaccuracy in the watermark.
* inaccuracy in the watermarks.
*
* Notably, we have two watermarks to allow for both a globally
* visible peak and one that can be reset at a smaller scope.
*
* Since we reset both watermarks when the global reset occurs,
* we can guarantee that watermark >= local_watermark, so we
* don't need to do both comparisons every time.
*
* On systems with branch predictors, the inner condition should
* be almost free.
/* See comment in page_counter_charge() */
diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 432db923bced..1e2d46636a0c 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -141,6 +141,16 @@ long cg_read_long(const char *cgroup, const char *control) return atol(buf); }
This should be in patch #2.