cpuacct.stat in non-root cgroups shows user time without guest time included in it. This doesn't match the user time shown in the root cpuacct.stat and /proc/<pid>/stat.
Make account_guest_time() add user time to the cgroup's cpustat to fix this.
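[For context, not part of the patch: the helper being switched to, task_group_account_field(), looks roughly like this in kernel/sched/cputime.c at the time of this series. It updates the global kernel_cpustat and also forwards the delta to the task's cgroup hierarchy, which is exactly what account_guest_time() was not doing for guest time.]

static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 */
	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);

	cgroup_account_cputime_field(p, index, tmp);
}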
Fixes: ef12fefabf94 ("cpuacct: add per-cgroup utime/stime statistics")
Signed-off-by: Andrey Ryabinin <arbn@yandex-team.com>
Cc: stable@vger.kernel.org
---
 kernel/sched/cputime.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5f611658eeab..95a9c5603d29 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -139,8 +139,6 @@ void account_user_time(struct task_struct *p, u64 cputime)
  */
 void account_guest_time(struct task_struct *p, u64 cputime)
 {
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-
 	/* Add guest time to process. */
 	p->utime += cputime;
 	account_group_user_time(p, cputime);
@@ -148,11 +146,11 @@ void account_guest_time(struct task_struct *p, u64 cputime)

 	/* Add guest time to cpustat. */
 	if (task_nice(p) > 0) {
-		cpustat[CPUTIME_NICE] += cputime;
-		cpustat[CPUTIME_GUEST_NICE] += cputime;
+		task_group_account_field(p, CPUTIME_NICE, cputime);
+		task_group_account_field(p, CPUTIME_GUEST_NICE, cputime);
 	} else {
-		cpustat[CPUTIME_USER] += cputime;
-		cpustat[CPUTIME_GUEST] += cputime;
+		task_group_account_field(p, CPUTIME_USER, cputime);
+		task_group_account_field(p, CPUTIME_GUEST, cputime);
 	}
 }
The global CPUTIME_USER counter already includes CPUTIME_GUEST. Likewise, CPUTIME_NICE already includes CPUTIME_GUEST_NICE.
Remove the additions of CPUTIME_GUEST[_NICE] to the total ->sum_exec_runtime so that they aren't accounted twice.
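[For context: in account_guest_time() (kernel/sched/cputime.c), guest time is added to the USER/NICE bucket as well as to the GUEST/GUEST_NICE bucket, so CPUTIME_USER already contains CPUTIME_GUEST:]

	/* Add guest time to cpustat. */
	if (task_nice(p) > 0) {
		cpustat[CPUTIME_NICE] += cputime;
		cpustat[CPUTIME_GUEST_NICE] += cputime;
	} else {
		cpustat[CPUTIME_USER] += cputime;
		cpustat[CPUTIME_GUEST] += cputime;
	}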
Fixes: 936f2a70f207 ("cgroup: add cpu.stat file to root cgroup")
Signed-off-by: Andrey Ryabinin <arbn@yandex-team.com>
Cc: stable@vger.kernel.org
---
 kernel/cgroup/rstat.c | 2 --
 1 file changed, 2 deletions(-)
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d51175cedfca..89ca9b61aa0d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -421,8 +421,6 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
 		cputime->sum_exec_runtime += user;
 		cputime->sum_exec_runtime += sys;
 		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
-		cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
-		cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
 	}
 }
Andrey Ryabinin <arbn@yandex-team.com> writes:
The global CPUTIME_USER counter already includes CPUTIME_GUEST. Likewise, CPUTIME_NICE already includes CPUTIME_GUEST_NICE.

Remove the additions of CPUTIME_GUEST[_NICE] to the total ->sum_exec_runtime so that they aren't accounted twice.
Yes, that's just wrong. usage_usec looks ok now.
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Tested-by: Daniel Jordan <daniel.m.jordan@oracle.com>
cpuacct has two different ways of accounting and showing user and system times.

The first one uses cpuacct_account_field() to account times and the cpuacct.stat file to expose them. This one seems to work fine.

The second one uses the cpuacct_charge() function for accounting and the set of cpuacct.usage* files to show times. Despite some attempts to fix it in the past, it still doesn't work. E.g. while running a KVM guest, cpuacct_charge() accounts most of the guest time as system time. This doesn't match the user and system times shown in cpuacct.stat or /proc/<pid>/stat.
Use the cpustats accounted in cpuacct_account_field() as the source of user/sys times for the cpuacct.usage* files. Make cpuacct_charge() account only the total execution time.
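[For context, not part of the patch: the per-cgroup cpustat used as the new source is filled in by cpuacct_account_field(), which at the time of this series looks roughly like this in kernel/sched/cpuacct.c. The root cgroup is handled by the caller via kernel_cpustat:]

void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
	struct cpuacct *ca;

	rcu_read_lock();
	for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
		__this_cpu_add(ca->cpustat->cpustat[index], val);
	rcu_read_unlock();
}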
Fixes: d740037fac70 ("sched/cpuacct: Split usage accounting into user_usage and sys_usage")
Signed-off-by: Andrey Ryabinin <arbn@yandex-team.com>
Cc: stable@vger.kernel.org
---
 kernel/sched/cpuacct.c | 77 +++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 43 deletions(-)
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 941c28cf9738..7eff79faab0d 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -29,7 +29,7 @@ struct cpuacct_usage {
 struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every CPU */
-	struct cpuacct_usage __percpu *cpuusage;
+	u64 __percpu *cpuusage;
 	struct kernel_cpustat __percpu *cpustat;
 };

@@ -49,7 +49,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
 	return css_ca(ca->css.parent);
 }

-static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
 static struct cpuacct root_cpuacct = {
 	.cpustat	= &kernel_cpustat,
 	.cpuusage	= &root_cpuacct_cpuusage,
@@ -68,7 +68,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (!ca)
 		goto out;

-	ca->cpuusage = alloc_percpu(struct cpuacct_usage);
+	ca->cpuusage = alloc_percpu(u64);
 	if (!ca->cpuusage)
 		goto out_free_ca;

@@ -99,7 +99,8 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
 				 enum cpuacct_stat_index index)
 {
-	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
 	u64 data;

 	/*
@@ -115,14 +116,17 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
 	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
 #endif

-	if (index == CPUACCT_STAT_NSTATS) {
-		int i = 0;
-
-		data = 0;
-		for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-			data += cpuusage->usages[i];
-	} else {
-		data = cpuusage->usages[index];
+	switch (index) {
+	case CPUACCT_STAT_USER:
+		data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
+		break;
+	case CPUACCT_STAT_SYSTEM:
+		data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
+			cpustat[CPUTIME_SOFTIRQ];
+		break;
+	case CPUACCT_STAT_NSTATS:
+		data = *cpuusage;
+		break;
 	}

 #ifndef CONFIG_64BIT
@@ -132,10 +136,14 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
 	return data;
 }

-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu)
 {
-	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-	int i;
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
+
+	/* Don't allow to reset global kernel_cpustat */
+	if (ca == &root_cpuacct)
+		return;

 #ifndef CONFIG_64BIT
 	/*
@@ -143,9 +151,10 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 	 */
 	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
 #endif
-
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-		cpuusage->usages[i] = val;
+	*cpuusage = 0;
+	cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0;
+	cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0;
+	cpustat[CPUTIME_SOFTIRQ] = 0;

 #ifndef CONFIG_64BIT
 	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
@@ -196,7 +205,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
 		return -EINVAL;

 	for_each_possible_cpu(cpu)
-		cpuacct_cpuusage_write(ca, cpu, 0);
+		cpuacct_cpuusage_write(ca, cpu);

 	return 0;
 }
@@ -243,25 +252,12 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
 	seq_puts(m, "\n");

 	for_each_possible_cpu(cpu) {
-		struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
 		seq_printf(m, "%d", cpu);

-		for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
-#ifndef CONFIG_64BIT
-			/*
-			 * Take rq->lock to make 64-bit read safe on 32-bit
-			 * platforms.
-			 */
-			raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-#endif
-
-			seq_printf(m, " %llu", cpuusage->usages[index]);
+		for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+			seq_printf(m, " %llu",
+				   cpuacct_cpuusage_read(ca, cpu, index));

-#ifndef CONFIG_64BIT
-			raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#endif
-		}
 		seq_puts(m, "\n");
 	}
 	return 0;
@@ -278,8 +274,8 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
 	for_each_possible_cpu(cpu) {
 		u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;

-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_USER];
-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_NICE];
+		val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
+		val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
 		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
 		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
 		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
@@ -339,16 +335,11 @@ static struct cftype files[] = {
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
-	int index = CPUACCT_STAT_SYSTEM;
-	struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
-
-	if (regs && user_mode(regs))
-		index = CPUACCT_STAT_USER;

 	rcu_read_lock();

 	for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
-		__this_cpu_add(ca->cpuusage->usages[index], cputime);
+		__this_cpu_add(*ca->cpuusage, cputime);

 	rcu_read_unlock();
 }
Andrey Ryabinin <arbn@yandex-team.com> writes:
cpuacct has two different ways of accounting and showing user and system times.

The first one uses cpuacct_account_field() to account times and the cpuacct.stat file to expose them. This one seems to work fine.

The second one uses the cpuacct_charge() function for accounting and the set of cpuacct.usage* files to show times. Despite some attempts to fix it in the past, it still doesn't work. E.g. while running a KVM guest, cpuacct_charge() accounts most of the guest time as system time. This doesn't match the user and system times shown in cpuacct.stat or /proc/<pid>/stat.
I couldn't reproduce this running a CPU-bound load in a KVM guest on a nohz_full CPU on 5.11. The time is almost entirely in cpuacct.usage and _user, while _sys stays low.
Could you say more about how you're seeing this? Don't really doubt there's a problem, just wondering what you're doing.
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 941c28cf9738..7eff79faab0d 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -29,7 +29,7 @@ struct cpuacct_usage {
 struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every CPU */
-	struct cpuacct_usage __percpu *cpuusage;
Definition of struct cpuacct_usage can go away now.
@@ -99,7 +99,8 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
 				 enum cpuacct_stat_index index)
 {
-	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
 	u64 data;
There's a BUG_ON below this that could probably be WARN_ON_ONCE while you're here
@@ -278,8 +274,8 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
 	for_each_possible_cpu(cpu) {
 		u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;

-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_USER];
-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_NICE];
+		val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
+		val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
unnecessary whitespace change?
On 3/18/21 1:22 AM, Daniel Jordan wrote:
Andrey Ryabinin <arbn@yandex-team.com> writes:
cpuacct has two different ways of accounting and showing user and system times.

The first one uses cpuacct_account_field() to account times and the cpuacct.stat file to expose them. This one seems to work fine.

The second one uses the cpuacct_charge() function for accounting and the set of cpuacct.usage* files to show times. Despite some attempts to fix it in the past, it still doesn't work. E.g. while running a KVM guest, cpuacct_charge() accounts most of the guest time as system time. This doesn't match the user and system times shown in cpuacct.stat or /proc/<pid>/stat.
I couldn't reproduce this running a CPU-bound load in a KVM guest on a nohz_full CPU on 5.11. The time is almost entirely in cpuacct.usage and _user, while _sys stays low.
Could you say more about how you're seeing this? Don't really doubt there's a problem, just wondering what you're doing.
Yeah, it's almost unnoticeable if you run some load in a guest like qemu.
But a simpler case with a busy loop in KVM_RUN triggers this:
# git clone https://github.com/aryabinin/kvmsample
# make
# mkdir /sys/fs/cgroup/cpuacct/test
# echo $$ > /sys/fs/cgroup/cpuacct/test/tasks
# ./kvmsample &
# for i in {1..5}; do cat /sys/fs/cgroup/cpuacct/test/cpuacct.usage_sys; sleep 1; done
1976535645
2979839428
3979832704
4983603153
5983604157
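[The numbers above show cpuacct.usage_sys growing by roughly one second per second of wall time even though the vCPU is busy in guest/user context. The reason is the pre-patch classification in cpuacct_charge(), quoted here from the lines the patch removes: the sample is charged to user or system depending on where the tick interrupted the task, and for a vCPU thread that is usually inside the KVM_RUN ioctl, i.e. kernel mode.]

	int index = CPUACCT_STAT_SYSTEM;
	struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);

	if (regs && user_mode(regs))
		index = CPUACCT_STAT_USER;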
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 941c28cf9738..7eff79faab0d 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -29,7 +29,7 @@ struct cpuacct_usage {
 struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every CPU */
-	struct cpuacct_usage __percpu *cpuusage;
Definition of struct cpuacct_usage can go away now.
Done.
@@ -99,7 +99,8 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
 				 enum cpuacct_stat_index index)
 {
-	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
 	u64 data;
There's a BUG_ON below this that could probably be WARN_ON_ONCE while you're here
Sure.
@@ -278,8 +274,8 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
 	for_each_possible_cpu(cpu) {
 		u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;

-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_USER];
-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_NICE];
+		val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
+		val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
unnecessary whitespace change?
yup
Andrey Ryabinin <arbn@yandex-team.com> writes:
cpuacct.stat in non-root cgroups shows user time without guest time included in it. This doesn't match the user time shown in the root cpuacct.stat and /proc/<pid>/stat.
Yeah, that's inconsistent.
Make account_guest_time() add user time to the cgroup's cpustat to fix this.
Yep.
cgroup2's cpu.stat is broken the same way for child cgroups, and this happily fixes it. Probably deserves a mention in the changelog.
The problem with cgroup2 was, if the workload was mostly guest time, cpu.stat's user and system together reflected it, but it was split unevenly across the two. I think guest time wasn't actually included in either bucket, it was just that the little user and system time there was got scaled up in cgroup_base_stat_cputime_show -> cputime_adjust to match sum_exec_runtime, which did have it.
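[A simplified sketch of that scaling, assuming the proportional-split behaviour of cputime_adjust() rather than its exact code: rtime (sum_exec_runtime) is split between user and system in proportion to the sampled utime/stime, so time that exists only in rtime, such as guest time here, gets spread across both buckets.]

/*
 * Simplified sketch only; the real cputime_adjust() in
 * kernel/sched/cputime.c also keeps the result monotonic and uses
 * mul_u64_u64_div_u64() to avoid overflow.
 */
static void cputime_adjust_sketch(u64 utime, u64 stime, u64 rtime,
				  u64 *ut, u64 *st)
{
	if (utime == 0) {
		stime = rtime;		/* all of rtime counted as system */
	} else if (stime == 0) {
		utime = rtime;		/* all of rtime counted as user */
	} else {
		/* split rtime in proportion to the sampled utime/stime */
		stime = div64_u64(stime * rtime, stime + utime);
		utime = rtime - stime;
	}
	*ut = utime;
	*st = stime;
}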
The stats look ok now for both cgroup1 and 2. Just slightly unsure whether we want to change the way both interfaces expose the accounting in case something out there depends on it. Seems like we should, but it'd be good to hear more opinions.
@@ -148,11 +146,11 @@ void account_guest_time(struct task_struct *p, u64 cputime)
 	/* Add guest time to cpustat. */
 	if (task_nice(p) > 0) {
-		cpustat[CPUTIME_NICE] += cputime;
-		cpustat[CPUTIME_GUEST_NICE] += cputime;
+		task_group_account_field(p, CPUTIME_NICE, cputime);
+		task_group_account_field(p, CPUTIME_GUEST_NICE, cputime);
 	} else {
-		cpustat[CPUTIME_USER] += cputime;
-		cpustat[CPUTIME_GUEST] += cputime;
+		task_group_account_field(p, CPUTIME_USER, cputime);
+		task_group_account_field(p, CPUTIME_GUEST, cputime);
 	}
Makes sense for _USER and _NICE, but it doesn't seem cgroup1 or 2 actually use _GUEST and _GUEST_NICE.
Could go either way. Consistency is nice, but I probably wouldn't change the GUEST ones so people aren't confused about why they're accounted. It's also extra cycles for nothing, even though most of the data is probably in the cache.
Sorry for abandoning this, got distracted by lots of other stuff.
On 3/18/21 1:09 AM, Daniel Jordan wrote:
Andrey Ryabinin <arbn@yandex-team.com> writes:
cpuacct.stat in non-root cgroups shows user time without guest time included in it. This doesn't match the user time shown in the root cpuacct.stat and /proc/<pid>/stat.
Yeah, that's inconsistent.
Make account_guest_time() add user time to the cgroup's cpustat to fix this.
Yep.
cgroup2's cpu.stat is broken the same way for child cgroups, and this happily fixes it. Probably deserves a mention in the changelog.
Sure.
The problem with cgroup2 was, if the workload was mostly guest time, cpu.stat's user and system together reflected it, but it was split unevenly across the two. I think guest time wasn't actually included in either bucket, it was just that the little user and system time there was got scaled up in cgroup_base_stat_cputime_show -> cputime_adjust to match sum_exec_runtime, which did have it.
The stats look ok now for both cgroup1 and 2. Just slightly unsure whether we want to change the way both interfaces expose the accounting in case something out there depends on it. Seems like we should, but it'd be good to hear more opinions.
@@ -148,11 +146,11 @@ void account_guest_time(struct task_struct *p, u64 cputime)
 	/* Add guest time to cpustat. */
 	if (task_nice(p) > 0) {
-		cpustat[CPUTIME_NICE] += cputime;
-		cpustat[CPUTIME_GUEST_NICE] += cputime;
+		task_group_account_field(p, CPUTIME_NICE, cputime);
+		task_group_account_field(p, CPUTIME_GUEST_NICE, cputime);
 	} else {
-		cpustat[CPUTIME_USER] += cputime;
-		cpustat[CPUTIME_GUEST] += cputime;
+		task_group_account_field(p, CPUTIME_USER, cputime);
+		task_group_account_field(p, CPUTIME_GUEST, cputime);
 	}
Makes sense for _USER and _NICE, but it doesn't seem cgroup1 or 2 actually use _GUEST and _GUEST_NICE.
Could go either way. Consistency is nice, but I probably wouldn't change the GUEST ones so people aren't confused about why they're accounted. It's also extra cycles for nothing, even though most of the data is probably in the cache.
Agreed, will leave the _GUEST* as is.
cpuacct.stat in non-root cgroups shows user time without guest time included in it. This doesn't match the user time shown in the root cpuacct.stat and /proc/<pid>/stat. This also affects cgroup2's cpu.stat in the same way.
Make account_guest_time() add user time to the cgroup's cpustat to fix this.
Fixes: ef12fefabf94 ("cpuacct: add per-cgroup utime/stime statistics")
Signed-off-by: Andrey Ryabinin <arbn@yandex-team.com>
Cc: stable@vger.kernel.org
---
Changes since v1:
 - Don't account CPUTIME_GUEST* since they aren't used by cgroups
---
 kernel/sched/cputime.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 872e481d5098..042a6dbce8f3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -148,10 +148,10 @@ void account_guest_time(struct task_struct *p, u64 cputime)

 	/* Add guest time to cpustat. */
 	if (task_nice(p) > 0) {
-		cpustat[CPUTIME_NICE] += cputime;
+		task_group_account_field(p, CPUTIME_NICE, cputime);
 		cpustat[CPUTIME_GUEST_NICE] += cputime;
 	} else {
-		cpustat[CPUTIME_USER] += cputime;
+		task_group_account_field(p, CPUTIME_USER, cputime);
 		cpustat[CPUTIME_GUEST] += cputime;
 	}
 }
The global CPUTIME_USER counter already includes CPUTIME_GUEST. Likewise, CPUTIME_NICE already includes CPUTIME_GUEST_NICE.

Remove the additions of CPUTIME_GUEST[_NICE] to the total ->sum_exec_runtime so that they aren't accounted twice.
Fixes: 936f2a70f207 ("cgroup: add cpu.stat file to root cgroup")
Signed-off-by: Andrey Ryabinin <arbn@yandex-team.com>
Cc: stable@vger.kernel.org
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Tested-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
Changes since v1:
 - Add review/tested tags to the changelog
---
 kernel/cgroup/rstat.c | 2 --
 1 file changed, 2 deletions(-)
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index b264ab5652ba..1486768f2318 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -433,8 +433,6 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
 		cputime->sum_exec_runtime += user;
 		cputime->sum_exec_runtime += sys;
 		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
-		cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
-		cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
 	}
 }
cpuacct has two different ways of accounting and showing user and system times.

The first one uses cpuacct_account_field() to account times and the cpuacct.stat file to expose them. This one seems to work fine.

The second one uses the cpuacct_charge() function for accounting and the set of cpuacct.usage* files to show times. Despite some attempts to fix it in the past, it still doesn't work. Sometimes while running a KVM guest, cpuacct_charge() accounts most of the guest time as system time. This doesn't match the user and system times shown in cpuacct.stat or /proc/<pid>/stat.
Demonstration:

# git clone https://github.com/aryabinin/kvmsample
# make
# mkdir /sys/fs/cgroup/cpuacct/test
# echo $$ > /sys/fs/cgroup/cpuacct/test/tasks
# ./kvmsample &
# for i in {1..5}; do cat /sys/fs/cgroup/cpuacct/test/cpuacct.usage_sys; sleep 1; done
1976535645
2979839428
3979832704
4983603153
5983604157
Use the cpustats accounted in cpuacct_account_field() as the source of user/sys times for the cpuacct.usage* files. Make cpuacct_charge() account only the total execution time.
Fixes: d740037fac70 ("sched/cpuacct: Split usage accounting into user_usage and sys_usage")
Signed-off-by: Andrey Ryabinin <arbn@yandex-team.com>
Cc: stable@vger.kernel.org
---
Changes since v1:
 - remove struct cpuacct_usage
---
 kernel/sched/cpuacct.c | 79 +++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 47 deletions(-)
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f347cf9e4634..9de7dd51beb0 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -21,15 +21,11 @@ static const char * const cpuacct_stat_desc[] = {
 	[CPUACCT_STAT_SYSTEM] = "system",
 };

-struct cpuacct_usage {
-	u64	usages[CPUACCT_STAT_NSTATS];
-};
-
 /* track CPU usage of a group of tasks and its child groups */
 struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every CPU */
-	struct cpuacct_usage __percpu *cpuusage;
+	u64 __percpu *cpuusage;
 	struct kernel_cpustat __percpu *cpustat;
 };

@@ -49,7 +45,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
 	return css_ca(ca->css.parent);
 }

-static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
 static struct cpuacct root_cpuacct = {
 	.cpustat	= &kernel_cpustat,
 	.cpuusage	= &root_cpuacct_cpuusage,
@@ -68,7 +64,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (!ca)
 		goto out;

-	ca->cpuusage = alloc_percpu(struct cpuacct_usage);
+	ca->cpuusage = alloc_percpu(u64);
 	if (!ca->cpuusage)
 		goto out_free_ca;

@@ -99,7 +95,8 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
 				 enum cpuacct_stat_index index)
 {
-	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
 	u64 data;

 	/*
@@ -116,14 +113,17 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
 	raw_spin_rq_lock_irq(cpu_rq(cpu));
 #endif

-	if (index == CPUACCT_STAT_NSTATS) {
-		int i = 0;
-
-		data = 0;
-		for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-			data += cpuusage->usages[i];
-	} else {
-		data = cpuusage->usages[index];
+	switch (index) {
+	case CPUACCT_STAT_USER:
+		data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
+		break;
+	case CPUACCT_STAT_SYSTEM:
+		data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
+			cpustat[CPUTIME_SOFTIRQ];
+		break;
+	case CPUACCT_STAT_NSTATS:
+		data = *cpuusage;
+		break;
 	}

 #ifndef CONFIG_64BIT
@@ -133,10 +133,14 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
 	return data;
 }

-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu)
 {
-	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-	int i;
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
+
+	/* Don't allow to reset global kernel_cpustat */
+	if (ca == &root_cpuacct)
+		return;

 #ifndef CONFIG_64BIT
 	/*
@@ -144,9 +148,10 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 	 */
 	raw_spin_rq_lock_irq(cpu_rq(cpu));
 #endif
-
-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-		cpuusage->usages[i] = val;
+	*cpuusage = 0;
+	cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0;
+	cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0;
+	cpustat[CPUTIME_SOFTIRQ] = 0;

 #ifndef CONFIG_64BIT
 	raw_spin_rq_unlock_irq(cpu_rq(cpu));
@@ -197,7 +202,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
 		return -EINVAL;

 	for_each_possible_cpu(cpu)
-		cpuacct_cpuusage_write(ca, cpu, 0);
+		cpuacct_cpuusage_write(ca, cpu);

 	return 0;
 }
@@ -244,25 +249,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
 	seq_puts(m, "\n");

 	for_each_possible_cpu(cpu) {
-		struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
 		seq_printf(m, "%d", cpu);
-
-		for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
-#ifndef CONFIG_64BIT
-			/*
-			 * Take rq->lock to make 64-bit read safe on 32-bit
-			 * platforms.
-			 */
-			raw_spin_rq_lock_irq(cpu_rq(cpu));
-#endif
-
-			seq_printf(m, " %llu", cpuusage->usages[index]);
-
-#ifndef CONFIG_64BIT
-			raw_spin_rq_unlock_irq(cpu_rq(cpu));
-#endif
-		}
+		for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+			seq_printf(m, " %llu",
+				   cpuacct_cpuusage_read(ca, cpu, index));
 		seq_puts(m, "\n");
 	}
 	return 0;
@@ -340,16 +330,11 @@ static struct cftype files[] = {
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
-	int index = CPUACCT_STAT_SYSTEM;
-	struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
-
-	if (regs && user_mode(regs))
-		index = CPUACCT_STAT_USER;

 	rcu_read_lock();

 	for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
-		__this_cpu_add(ca->cpuusage->usages[index], cputime);
+		__this_cpu_add(*ca->cpuusage, cputime);

 	rcu_read_unlock();
 }
On Fri, Aug 20, 2021 at 12:40:04PM +0300, Andrey Ryabinin wrote:
cpuacct has two different ways of accounting and showing user and system times.

The first one uses cpuacct_account_field() to account times and the cpuacct.stat file to expose them. This one seems to work fine.

The second one uses the cpuacct_charge() function for accounting and the set of cpuacct.usage* files to show times. Despite some attempts to fix it in the past, it still doesn't work. Sometimes while running a KVM guest, cpuacct_charge() accounts most of the guest time as system time. This doesn't match the user and system times shown in cpuacct.stat or /proc/<pid>/stat.
Demonstration:

# git clone https://github.com/aryabinin/kvmsample
# make
# mkdir /sys/fs/cgroup/cpuacct/test
# echo $$ > /sys/fs/cgroup/cpuacct/test/tasks
# ./kvmsample &
# for i in {1..5}; do cat /sys/fs/cgroup/cpuacct/test/cpuacct.usage_sys; sleep 1; done
1976535645
2979839428
3979832704
4983603153
5983604157
Thanks for expanding on this, and for fixing the broken cpuacct_charge.
For the series,

Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
On Fri, Aug 20, 2021 at 12:40:01PM +0300, Andrey Ryabinin wrote:
cpuacct.stat in non-root cgroups shows user time without guest time included in it. This doesn't match the user time shown in the root cpuacct.stat and /proc/<pid>/stat. This also affects cgroup2's cpu.stat in the same way.

Make account_guest_time() add user time to the cgroup's cpustat to fix this.
Fixes: ef12fefabf94 ("cpuacct: add per-cgroup utime/stime statistics")
Signed-off-by: Andrey Ryabinin <arbn@yandex-team.com>
Cc: stable@vger.kernel.org
The fact that this has been broken for so long, probably from the beginning, gives me some pause, but the patches look fine to me.
For the series,
Acked-by: Tejun Heo <tj@kernel.org>
Thanks.