v2: - Fix test_cpuset_prs.sh problems reported by test robot - Relax restriction imposed between cpuset.cpus.exclusive and cpuset.cpus of sibling cpusets. - Make cpuset.cpus.exclusive independent of cpuset.cpus. - Update test_cpuset_prs.sh accordingly.
[v1] https://lore.kernel.org/lkml/20240605171858.1323464-1-longman@redhat.com/
This patchset attempts to address the following cpuset issues. 1) While reviewing the generate_sched_domains() function, I found a bug in generating sched domains for remote non-isolating partitions. 2) Test robot had reported a test_cpuset_prs.sh test failure. 3) The current exclusivity test between cpuset.cpus.exclusive and cpuset.cpus and the restriction that the set effective exclusive CPUs has to be a subset of cpuset.cpus make it harder to preconfigure the cgroup hierarchy to enable remote partition.
The test_cpuset_prs.sh script is updated to match changes made in this patchset and was run to verify that the new code did not cause any regression.
Waiman Long (5): cgroup/cpuset: Fix remote root partition creation problem selftest/cgroup: Fix test_cpuset_prs.sh problems reported by test robot cgroup/cpuset: Delay setting of CS_CPU_EXCLUSIVE until valid partition cgroup/cpuset: Make cpuset.cpus.exclusive independent of cpuset.cpus selftest/cgroup: Update test_cpuset_prs.sh to match changes
Documentation/admin-guide/cgroup-v2.rst | 12 +- kernel/cgroup/cpuset.c | 158 +++++++++++++----- .../selftests/cgroup/test_cpuset_prs.sh | 75 ++++++--- 3 files changed, 180 insertions(+), 65 deletions(-)
Since commit 181c8e091aae ("cgroup/cpuset: Introduce remote partition"), a remote partition can be created underneath a non-partition root cpuset as long as its exclusive_cpus are set to distribute exclusive CPUs down to its children. The generate_sched_domains() function, however, doesn't take into account this new behavior and hence will fail to create the sched domain needed for a remote root (non-isolated) partition.
There are two issues related to remote partition support. First of all, generate_sched_domains() has a fast path that is activated if root_load_balance is true and top_cpuset.nr_subparts is non-zero. The later condition isn't quite correct for remote partitions as nr_subparts just shows the number of local child partitions underneath it. There can be no local child partition under top_cpuset even if there are remote partitions further down the hierarchy. Fix that by checking for subpartitions_cpus which contains exclusive CPUs allocated to both local and remote partitions.
Secondly, the valid partition check for subtree skipping in the csa[] generation loop isn't enough as remote partition does not need to have a partition root parent. Fix this problem by breaking csa[] array generation loop of generate_sched_domains() into v1 and v2 specific parts and checking a cpuset's exclusive_cpus before skipping its subtree in the v2 case.
Also simplify generate_sched_domains() for cgroup v2 as only non-isolating partition roots should be included in building the cpuset array and none of the v1 scheduling attributes other than a different way to create an isolated partition are supported.
Fixes: 181c8e091aae ("cgroup/cpuset: Introduce remote partition") Signed-off-by: Waiman Long longman@redhat.com --- kernel/cgroup/cpuset.c | 55 ++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 13 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f9b97f65e204..fb71d710a603 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -169,7 +169,7 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level;
- /* number of valid sub-partitions */ + /* number of valid local child partitions */ int nr_subparts;
/* partition root state */ @@ -957,13 +957,14 @@ static int generate_sched_domains(cpumask_var_t **domains, int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); + bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
doms = NULL; dattr = NULL; csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && !top_cpuset.nr_subparts) { + if (root_load_balance && cpumask_empty(subpartitions_cpus)) { single_root_domain: ndoms = 1; doms = alloc_sched_domains(ndoms); @@ -992,16 +993,18 @@ static int generate_sched_domains(cpumask_var_t **domains, cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; + + if (cgrpv2) + goto v2; + /* + * v1: * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The * latter: All child cpusets contain a subset of the * parent's cpus, so just skip them, and then we call * update_domain_attr_tree() to calc relax_domain_level of * the corresponding sched domain. - * - * If root is load-balancing, we can skip @cp if it - * is a subset of the root's effective_cpus. */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && @@ -1009,16 +1012,28 @@ static int generate_sched_domains(cpumask_var_t **domains, housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue;
- if (root_load_balance && - cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) - continue; - if (is_sched_load_balance(cp) && !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp;
- /* skip @cp's subtree if not a partition root */ - if (!is_partition_valid(cp)) + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + +v2: + /* + * Only valid partition roots that are not isolated and with + * non-empty effective_cpus will be saved into csn[]. + */ + if ((cp->partition_root_state == PRS_ROOT) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; + + /* + * Skip @cp's subtree if not a partition root and has no + * exclusive CPUs to be granted to child cpusets. + */ + if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); @@ -1072,6 +1087,20 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), GFP_KERNEL);
+ /* + * Cgroup v2 doesn't support domain attributes, just set all of them + * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a + * subset of HK_TYPE_DOMAIN housekeeping CPUs. + */ + if (cgrpv2) { + for (i = 0; i < ndoms; i++) { + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; + } + goto done; + } + for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; struct cpumask *dp; @@ -1231,7 +1260,7 @@ static void rebuild_sched_domains_locked(void) * root should be only a subset of the active CPUs. Since a CPU in any * partition root could be offlined, all must be checked. */ - if (top_cpuset.nr_subparts) { + if (!cpumask_empty(subpartitions_cpus)) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!is_partition_valid(cs)) { @@ -4575,7 +4604,7 @@ static void cpuset_handle_hotplug(void) * In the rare case that hotplug removes all the cpus in * subpartitions_cpus, we assumed that cpus are updated. */ - if (!cpus_updated && top_cpuset.nr_subparts) + if (!cpus_updated && !cpumask_empty(subpartitions_cpus)) cpus_updated = true;
/* For v1, synchronize cpus_allowed to cpu_active_mask */
The test robot reported two different problems when running the test_cpuset_prs.sh test.
# ./test_cpuset_prs.sh: line 106: echo: write error: Input/output error # : # Effective cpus changed to 0-1,4-7 after test 4!
The write error is caused by writing to /dev/console. It looks like some systems may not have /dev/console configured or in a writeable state. Fix this by checking the existence of /dev/console before attempting to write it.
After the completion of each test run, the script will check if the cpuset state is reset back to the original state. That usually takes a while to happen. The test script inserts some artificial delay to make sure that the reset has completed. The current setting is about 80ms. That may not be enough in some cases especially if the test system is slow. Double it to 160ms to minimize the chance of this type of failure.
Reported-by: kernel test robot oliver.sang@intel.com Closes: https://lore.kernel.org/oe-lkp/202406141712.dbbaa8fd-oliver.sang@intel.com Signed-off-by: Waiman Long longman@redhat.com --- .../selftests/cgroup/test_cpuset_prs.sh | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-)
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index b5eb1be2248c..96bf2b5c5eb6 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -28,6 +28,14 @@ CPULIST=$(cat $CGROUP2/cpuset.cpus.effective) NR_CPUS=$(lscpu | grep "^CPU(s):" | sed -e "s/.*:[[:space:]]*//") [[ $NR_CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!"
+# Check to see if /dev/console exists and is writable +if [[ -c /dev/console && -w /dev/console ]] +then + CONSOLE=/dev/console +else + CONSOLE=/dev/null +fi + # Set verbose flag and delay factor PROG=$1 VERBOSE=0 @@ -103,8 +111,8 @@ console_msg() { MSG=$1 echo "$MSG" - echo "" > /dev/console - echo "$MSG" > /dev/console + echo "" > $CONSOLE + echo "$MSG" > $CONSOLE pause 0.01 }
@@ -694,9 +702,9 @@ null_isolcpus_check() [[ $VERBOSE -gt 0 ]] || return 0 # Retry a few times before printing error RETRY=0 - while [[ $RETRY -lt 5 ]] + while [[ $RETRY -lt 8 ]] do - pause 0.01 + pause 0.02 check_isolcpus "." [[ $? -eq 0 ]] && return 0 ((RETRY++)) @@ -726,7 +734,7 @@ run_state_test()
while [[ $I -lt $CNT ]] do - echo "Running test $I ..." > /dev/console + echo "Running test $I ..." > $CONSOLE [[ $VERBOSE -gt 1 ]] && { echo "" eval echo ${$TEST[$I]} @@ -783,7 +791,7 @@ run_state_test() while [[ $NEWLIST != $CPULIST && $RETRY -lt 8 ]] do # Wait a bit longer & recheck a few times - pause 0.01 + pause 0.02 ((RETRY++)) NEWLIST=$(cat cpuset.cpus.effective) done
The CS_CPU_EXCLUSIVE flag is currently set whenever cpuset.cpus.exclusive is set to make sure that the exclusivity test will be run to ensure its exclusiveness. At the same time, this flag can be changed whenever the partition root state is changed. For example, the CS_CPU_EXCLUSIVE flag will be reset whenever a partition root becomes invalid. This makes using CS_CPU_EXCLUSIVE to ensure exclusiveness a bit fragile.
The current scheme also makes setting up a cpuset.cpus.exclusive hierarchy to enable remote partition harder as cpuset.cpus.exclusive cannot overlap with any cpuset.cpus of sibling cpusets if their cpuset.cpus.exclusive aren't set.
Solve these issues by deferring the setting of CS_CPU_EXCLUSIVE flag until the cpuset become a valid partition root while adding new checks in validate_change() to ensure that cpuset.cpus.exclusive of sibling cpusets cannot overlap.
An additional check is also added to validate_change() to make sure that cpuset.cpus of one cpuset cannot be a subset of cpuset.cpus.exclusive of a sibling cpuset to avoid the problem that none of those CPUs will be available when these exclusive CPUs are extracted out to a newly enabled partition root. The Documentation/admin-guide/cgroup-v2.rst file is updated to document the new constraints.
Signed-off-by: Waiman Long longman@redhat.com --- Documentation/admin-guide/cgroup-v2.rst | 8 ++++-- kernel/cgroup/cpuset.c | 36 ++++++++++++++++++++----- 2 files changed, 35 insertions(+), 9 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index f886c6ad704e..722e4762c4e0 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2359,8 +2359,12 @@ Cpuset Interface Files is always a subset of it.
Users can manually set it to a value that is different from - "cpuset.cpus". The only constraint in setting it is that the - list of CPUs must be exclusive with respect to its sibling. + "cpuset.cpus". One constraint in setting it is that the list of + CPUs must be exclusive with respect to "cpuset.cpus.exclusive" + of its sibling. If "cpuset.cpus.exclusive" of a sibling cgroup + isn't set, its "cpuset.cpus" value, if set, cannot be a subset + of it to leave at least one CPU available when the exclusive + CPUs are taken away.
For a parent cgroup, any one of its exclusive CPUs can only be distributed to at most one of its child cgroups. Having an diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index fb71d710a603..144bfc319809 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -825,17 +825,41 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/* * If either I or some sibling (!= me) is exclusive, we can't - * overlap + * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur) { + bool txset, cxset; /* Are exclusive_cpus set? */ + + if (c == cur) + continue; + + txset = !cpumask_empty(trial->exclusive_cpus); + cxset = !cpumask_empty(c->exclusive_cpus); + if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || + (txset && cxset)) { if (!cpusets_are_exclusive(trial, c)) goto out; + } else if (txset || cxset) { + struct cpumask *xcpus, *acpus; + + /* + * When just one of the exclusive_cpus's is set, + * cpus_allowed of the other cpuset, if set, cannot be + * a subset of it or none of those CPUs will be + * available if these exclusive CPUs are activated. + */ + if (txset) { + xcpus = trial->exclusive_cpus; + acpus = c->cpus_allowed; + } else { + xcpus = c->exclusive_cpus; + acpus = trial->cpus_allowed; + } + if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) + goto out; } if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) goto out; } @@ -1375,7 +1399,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, */ static int update_partition_exclusive(struct cpuset *cs, int new_prs) { - bool exclusive = (new_prs > 0); + bool exclusive = (new_prs > PRS_MEMBER);
if (exclusive && !is_cpu_exclusive(cs)) { if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) @@ -2620,8 +2644,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, retval = cpulist_parse(buf, trialcs->exclusive_cpus); if (retval < 0) return retval; - if (!is_cpu_exclusive(cs)) - set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags); }
/* Nothing to do if the CPUs didn't change */
The "cpuset.cpus.exclusive.effective" value is currently limited to a subset of its "cpuset.cpus". This makes the exclusive CPUs distribution hierarchy subsumed within the larger "cpuset.cpus" hierarchy. We have to decide on what CPUs are used locally and what CPUs can be passed down as exclusive CPUs down the hierarchy and combine them into "cpuset.cpus".
The advantage of the current scheme is to have only one hierarchy to worry about. However, it make it harder to use as all the "cpuset.cpus" values have to be properly set along the way down to the designated remote partition root. It also makes it more cumbersome to find out what CPUs can be used locally.
Make creation of remote partition simpler by breaking the dependency of "cpuset.cpus.exclusive" on "cpuset.cpus" and make them independent entities. Now we have two separate hierarchies - one for setting "cpuset.cpus.effective" and the other one for setting "cpuset.cpus.exclusive.effective". We may not need to set "cpuset.cpus" when we activate a partition root anymore.
Also update Documentation/admin-guide/cgroup-v2.rst and cpuset.c comment to document this change.
Suggested-by: Petr Malat oss@malat.biz Signed-off-by: Waiman Long longman@redhat.com --- Documentation/admin-guide/cgroup-v2.rst | 4 +- kernel/cgroup/cpuset.c | 67 +++++++++++++++++-------- 2 files changed, 49 insertions(+), 22 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 722e4762c4e0..2e4e74bea6ef 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2380,8 +2380,8 @@ Cpuset Interface Files cpuset-enabled cgroups.
This file shows the effective set of exclusive CPUs that - can be used to create a partition root. The content of this - file will always be a subset of "cpuset.cpus" and its parent's + can be used to create a partition root. The content + of this file will always be a subset of its parent's "cpuset.cpus.exclusive.effective" if its parent is not the root cgroup. It will also be a subset of "cpuset.cpus.exclusive" if it is set. If "cpuset.cpus.exclusive" is not set, it is diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 144bfc319809..fe76045aa528 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -87,7 +87,7 @@ static const char * const perr_strings[] = { [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug", - [PERR_CPUSEMPTY] = "cpuset.cpus is empty", + [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", };
@@ -127,19 +127,28 @@ struct cpuset { /* * Exclusive CPUs dedicated to current cgroup (default hierarchy only) * - * This exclusive CPUs must be a subset of cpus_allowed. A parent - * cgroup can only grant exclusive CPUs to one of its children. + * The effective_cpus of a valid partition root comes solely from its + * effective_xcpus and some of the effective_xcpus may be distributed + * to sub-partitions below & hence excluded from its effective_cpus. + * For a valid partition root, its effective_cpus have no relationship + * with cpus_allowed unless its exclusive_cpus isn't set. * - * When the cgroup becomes a valid partition root, effective_xcpus - * defaults to cpus_allowed if not set. The effective_cpus of a valid - * partition root comes solely from its effective_xcpus and some of the - * effective_xcpus may be distributed to sub-partitions below & hence - * excluded from its effective_cpus. + * This value will only be set if either exclusive_cpus is set or + * when this cpuset becomes a local partition root. */ cpumask_var_t effective_xcpus;
/* * Exclusive CPUs as requested by the user (default hierarchy only) + * + * Its value is independent of cpus_allowed and designates the set of + * CPUs that can be granted to the current cpuset or its children when + * it becomes a valid partition root. The effective set of exclusive + * CPUs granted (effective_xcpus) depends on whether those exclusive + * CPUs are passed down by its ancestors and not yet taken up by + * another sibling partition root along the way. + * + * If its value isn't set, it defaults to cpus_allowed. */ cpumask_var_t exclusive_cpus;
@@ -230,6 +239,17 @@ static struct list_head remote_children; * 2 - partition root without load balancing (isolated) * -1 - invalid partition root * -2 - invalid isolated partition root + * + * There are 2 types of partitions - local or remote. Local partitions are + * those whose parents are partition root themselves. Setting of + * cpuset.cpus.exclusive are optional in setting up local partitions. + * Remote partitions are those whose parents are not partition roots. Passing + * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor + * nodes are mandatory in creating a remote partition. + * + * For simplicity, a local partition can be created under a local or remote + * partition but a remote partition cannot have any partition root in its + * ancestor chain except the cgroup root. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -709,6 +729,19 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); }
+/* Return user specified exclusive CPUs */ +static inline struct cpumask *user_xcpus(struct cpuset *cs) +{ + return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed + : cs->exclusive_cpus; +} + +static inline bool xcpus_empty(struct cpuset *cs) +{ + return cpumask_empty(cs->cpus_allowed) && + cpumask_empty(cs->exclusive_cpus); +} + static inline struct cpumask *fetch_xcpus(struct cpuset *cs) { return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : @@ -1593,7 +1626,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); * Return: true if xcpus is not empty, false otherwise. * * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), - * it must be a subset of cpus_allowed and parent's effective_xcpus. + * it must be a subset of parent's effective_xcpus. */ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, struct cpumask *xcpus) @@ -1603,12 +1636,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, if (!xcpus) xcpus = cs->effective_xcpus;
- if (!cpumask_empty(cs->exclusive_cpus)) - cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed); - else - cpumask_copy(xcpus, cs->cpus_allowed); - - return cpumask_and(xcpus, xcpus, parent->effective_xcpus); + return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); }
static inline bool is_remote_partition(struct cpuset *cs) @@ -1887,8 +1915,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; - xcpus = !cpumask_empty(cs->exclusive_cpus) - ? cs->effective_xcpus : cs->cpus_allowed; + xcpus = user_xcpus(cs);
if (cmd == partcmd_invalidate) { if (is_prs_invalid(old_prs)) @@ -1916,7 +1943,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, return is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART; } - if (!newmask && cpumask_empty(cs->cpus_allowed)) + if (!newmask && xcpus_empty(cs)) return PERR_CPUSEMPTY;
nocpu = tasks_nocpu_error(parent, cs, xcpus); @@ -3130,9 +3157,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) ? partcmd_enable : partcmd_enablei;
/* - * cpus_allowed cannot be empty. + * cpus_allowed and exclusive_cpus cannot be both empty. */ - if (cpumask_empty(cs->cpus_allowed)) { + if (xcpus_empty(cs)) { err = PERR_CPUSEMPTY; goto out; }
Unlike the list of isolated CPUs, it is not easy to programamatically determine what sched domains are being created by the scheduler just by examinng the data in various kernfs filesystems. The easiest way to get this information is by enabling /sys/kernel/debug/sched/verbose file to make those information displayed in the console. This is also what the test_cpuset_prs.sh script is doing when the -v flag is given.
It is rather hard to fetch the data from the console and compare it to the expected result. An easier way is to dump the expected sched-domain information out to the console so that they can be visually compared with the actual sched domain data. However, this have to be done manually by visual inspection and so will only be done once in a while.
Moreover the preceding cpuset commits also change the cpuset behavior requiring corresponding chanages in some test cases as well as new test cases to test the newly added functionality.
Signed-off-by: Waiman Long longman@redhat.com --- .../selftests/cgroup/test_cpuset_prs.sh | 55 ++++++++++++++----- 1 file changed, 40 insertions(+), 15 deletions(-)
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 96bf2b5c5eb6..7c08cc153367 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -169,6 +169,14 @@ test_add_proc() # T = put a task into cgroup # O<c>=<v> = Write <v> to CPU online file of <c> # +# ECPUs - effective CPUs of cpusets +# Pstate - partition root state +# ISOLCPUS - isolated CPUs (<icpus>[,<icpus2>]) +# +# Note that if there are 2 fields in ISOLCPUS, the first one is for +# sched-debug matching which includes offline CPUs and single-CPU partitions +# while the second one is for matching cpuset.cpus.isolated. +# SETUP_A123_PARTITIONS="C1-3:P1:S+ C2-3:P1:S+ C3:P1" TEST_MATRIX=( # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS @@ -228,23 +236,29 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3:P2 . . 0 A1:0-1,A2:2-3,A3:2-3 A1:P0,A2:P2 2-3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X3:P2 . . 0 A1:0-2,A2:3,A3:3 A1:P0,A2:P2 3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2 . 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3" - " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-2,A2:1-2,A3:3 A1:P0,A3:P2 3" + " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3" " C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-3,A2:1-3,A3:2-3,B1:2-3 A1:P0,A3:P0,B1:P-2" " C0-3:S+ C1-3:S+ C2-3 C4-5 . . . P2 0 B1:4-5 B1:P2 4-5" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2 0 A3:2-3,B1:4 A3:P2,B1:P2 2-4" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2:C1-3 P2 0 A3:2-3,B1:4 A3:P2,B1:P2 2-4" " C0-3:S+ C1-3:S+ C2-3 C4 X1-3 X1-3:P2 P2 . 0 A2:1,A3:2-3 A2:P2,A3:P2 1-3" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2:C4-5 0 A3:2-3,B1:4-5 A3:P2,B1:P2 2-5" + " C4:X0-3:S+ X1-3:S+ X2-3 . . P2 . . 0 A1:4,A2:1-3,A3:1-3 A2:P2 1-3" + " C4:X0-3:S+ X1-3:S+ X2-3 . . . P2 . 0 A1:4,A2:4,A3:2-3 A3:P2 2-3"
# Nested remote/local partition tests " C0-3:S+ C1-3:S+ C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4-5 \ A1:P0,A2:P1,A3:P2,B1:P1 2-3" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4 \ A1:P0,A2:P1,A3:P2,B1:P1 2-4,2-3" + " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 . P1 0 A1:0-1,A2:2-3,A3:2-3,B1:4 \ + A1:P0,A2:P1,A3:P0,B1:P1" " C0-3:S+ C1-3:S+ C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:2,A3:3,B1:4 \ A1:P0,A2:P1,A3:P2,B1:P1 2-4,3" " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1,A2:2-3,A3:4 \ A1:P0,A2:P2,A3:P1 2-4,2-3" + " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X3-4:P1 . 0 A1:0-1,A2:2,A3:3-4 \ + A1:P0,A2:P2,A3:P1 2" " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \ . . X5 . . 0 A1:0-4,A2:1-4,A3:2-4 \ A1:P0,A2:P-2,A3:P-1" @@ -270,8 +284,8 @@ TEST_MATRIX=( . . X2-3 P2 . . 0 A1:0-2,A2:3,XA2:3 A2:P2 3"
# Invalid to valid local partition direct transition tests - " C1-3:S+:P2 C2-3:X1:P2 . . . . . . 0 A1:1-3,XA1:1-3,A2:2-3:XA2: A1:P2,A2:P-2 1-3" - " C1-3:S+:P2 C2-3:X1:P2 . . . X3:P2 . . 0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3" + " C1-3:S+:P2 X4:P2 . . . . . . 0 A1:1-3,XA1:1-3,A2:1-3:XA2: A1:P2,A2:P-2 1-3" + " C1-3:S+:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3" " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4,B1:4-6 A1:P-2,B1:P0" " C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3,B1:4-6 A1:P2,B1:P0 0-3" " C0-3:P2 . . C3-5:C4-5 . . . . 0 A1:0-3,B1:4-5 A1:P2,B1:P0 0-3" @@ -282,21 +296,18 @@ TEST_MATRIX=( " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ . . X4 . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3" " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ - . . C4 . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3" + . . C4:X . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3" # Local partition CPU change tests " C0-5:S+:P2 C4-5:S+:P1 . . . C3-5 . . 0 A1:0-2,A2:3-5 A1:P2,A2:P1 0-2" " C0-5:S+:P2 C4-5:S+:P1 . . C1-5 . . . 0 A1:1-3,A2:4-5 A1:P2,A2:P1 1-3"
# cpus_allowed/exclusive_cpus update tests " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ - . C4 . P2 . 0 A1:4,A2:4,XA2:,XA3:,A3:4 \ + . X:C4 . P2 . 0 A1:4,A2:4,XA2:,XA3:,A3:4 \ A1:P0,A3:P-2" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ . X1 . P2 . 0 A1:0-3,A2:1-3,XA1:1,XA2:,XA3:,A3:2-3 \ A1:P0,A3:P-2" - " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ - . . C3 P2 . 0 A1:0-2,A2:0-2,XA2:3,XA3:3,A3:3 \ - A1:P0,A3:P2 3" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ . . X3 P2 . 0 A1:0-2,A2:1-2,XA2:3,XA3:3,A3:3 \ A1:P0,A3:P2 3" @@ -304,10 +315,7 @@ TEST_MATRIX=( . . X3 . . 0 A1:0-3,A2:1-3,XA2:3,XA3:3,A3:2-3 \ A1:P0,A3:P-2" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ - . . C3 . . 0 A1:0-3,A2:3,XA2:3,XA3:3,A3:3 \ - A1:P0,A3:P-2" - " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ - . C4 . . . 0 A1:4,A2:4,A3:4,XA1:,XA2:,XA3 \ + . X4 . . . 0 A1:0-3,A2:1-3,A3:2-3,XA1:4,XA2:,XA3 \ A1:P0,A3:P-2"
# old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS @@ -354,6 +362,9 @@ TEST_MATRIX=( " C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P-1,B1:P-1" " C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P0,B1:P-1"
+ # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it + " C0-3 . . C4-5 X5 . . . 0 A1:0-3,B1:4-5" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: @@ -363,6 +374,9 @@ TEST_MATRIX=(
# Changes to cpuset.cpus.exclusive that violate exclusivity rule is rejected " C0-3 . . C4-5 X0-3 . . X3-5 1 A1:0-3,B1:4-5" + + # cpuset.cpus cannot be a subset of sibling cpuset.cpus.exclusive + " C0-3 . . C4-5 X3-5 . . . 1 A1:0-3,B1:4-5" )
# @@ -564,14 +578,15 @@ check_cgroup_states() do set -- $(echo $CHK | sed -e "s/:/ /g") CGRP=$1 + CGRP_DIR=$CGRP STATE=$2 FILE= EVAL=$(expr substr $STATE 2 2) - [[ $CGRP = A2 ]] && CGRP=A1/A2 - [[ $CGRP = A3 ]] && CGRP=A1/A2/A3 + [[ $CGRP = A2 ]] && CGRP_DIR=A1/A2 + [[ $CGRP = A3 ]] && CGRP_DIR=A1/A2/A3
case $STATE in - P*) FILE=$CGRP/cpuset.cpus.partition + P*) FILE=$CGRP_DIR/cpuset.cpus.partition ;; *) echo "Unknown state: $STATE!" exit 1 @@ -595,6 +610,16 @@ check_cgroup_states() ;; esac [[ $EVAL != $VAL ]] && return 1 + + # + # For root partition, dump sched-domains info to console if + # verbose mode set for manual comparison with sched debug info. + # + [[ $VAL -eq 1 && $VERBOSE -gt 0 ]] && { + DOMS=$(cat $CGRP_DIR/cpuset.cpus.effective) + [[ -n "$DOMS" ]] && + echo " [$CGRP] sched-domain: $DOMS" > $CONSOLE + } done return 0 }
On Mon, Jun 17, 2024 at 10:39:40AM -0400, Waiman Long wrote:
v2:
- Fix test_cpuset_prs.sh problems reported by test robot
- Relax restriction imposed between cpuset.cpus.exclusive and cpuset.cpus of sibling cpusets.
- Make cpuset.cpus.exclusive independent of cpuset.cpus.
- Update test_cpuset_prs.sh accordingly.
[v1] https://lore.kernel.org/lkml/20240605171858.1323464-1-longman@redhat.com/
This patchset attempts to address the following cpuset issues.
- While reviewing the generate_sched_domains() function, I found a bug in generating sched domains for remote non-isolating partitions.
- Test robot had reported a test_cpuset_prs.sh test failure.
- The current exclusivity test between cpuset.cpus.exclusive and cpuset.cpus and the restriction that the set effective exclusive CPUs has to be a subset of cpuset.cpus make it harder to preconfigure the cgroup hierarchy to enable remote partition.
The test_cpuset_prs.sh script is updated to match changes made in this patchset and was run to verify that the new code did not cause any regression.
Applied to cgroup/for-6.11.
Thanks.
Hi, I finally got some time to test this and it works exactly as we needed it to. Thanks a lot, Petr
On Mon, Jun 17, 2024 at 10:39:44AM -0400, Waiman Long wrote:
The "cpuset.cpus.exclusive.effective" value is currently limited to a subset of its "cpuset.cpus". This makes the exclusive CPUs distribution hierarchy subsumed within the larger "cpuset.cpus" hierarchy. We have to decide on what CPUs are used locally and what CPUs can be passed down as exclusive CPUs down the hierarchy and combine them into "cpuset.cpus".
The advantage of the current scheme is to have only one hierarchy to worry about. However, it make it harder to use as all the "cpuset.cpus" values have to be properly set along the way down to the designated remote partition root. It also makes it more cumbersome to find out what CPUs can be used locally.
Make creation of remote partition simpler by breaking the dependency of "cpuset.cpus.exclusive" on "cpuset.cpus" and make them independent entities. Now we have two separate hierarchies - one for setting "cpuset.cpus.effective" and the other one for setting "cpuset.cpus.exclusive.effective". We may not need to set "cpuset.cpus" when we activate a partition root anymore.
Also update Documentation/admin-guide/cgroup-v2.rst and cpuset.c comment to document this change.
Suggested-by: Petr Malat oss@malat.biz Signed-off-by: Waiman Long longman@redhat.com
Documentation/admin-guide/cgroup-v2.rst | 4 +- kernel/cgroup/cpuset.c | 67 +++++++++++++++++-------- 2 files changed, 49 insertions(+), 22 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 722e4762c4e0..2e4e74bea6ef 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2380,8 +2380,8 @@ Cpuset Interface Files cpuset-enabled cgroups.
This file shows the effective set of exclusive CPUs that
- can be used to create a partition root. The content of this
- file will always be a subset of "cpuset.cpus" and its parent's
- can be used to create a partition root. The content
- of this file will always be a subset of its parent's "cpuset.cpus.exclusive.effective" if its parent is not the root cgroup. It will also be a subset of "cpuset.cpus.exclusive" if it is set. If "cpuset.cpus.exclusive" is not set, it is
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 144bfc319809..fe76045aa528 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -87,7 +87,7 @@ static const char * const perr_strings[] = { [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug",
- [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
- [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
};
@@ -127,19 +127,28 @@ struct cpuset { /* * Exclusive CPUs dedicated to current cgroup (default hierarchy only) *
* This exclusive CPUs must be a subset of cpus_allowed. A parent
* cgroup can only grant exclusive CPUs to one of its children.
* The effective_cpus of a valid partition root comes solely from its
* effective_xcpus and some of the effective_xcpus may be distributed
* to sub-partitions below & hence excluded from its effective_cpus.
* For a valid partition root, its effective_cpus have no relationship
* with cpus_allowed unless its exclusive_cpus isn't set.
* When the cgroup becomes a valid partition root, effective_xcpus
* defaults to cpus_allowed if not set. The effective_cpus of a valid
* partition root comes solely from its effective_xcpus and some of the
* effective_xcpus may be distributed to sub-partitions below & hence
* excluded from its effective_cpus.
* This value will only be set if either exclusive_cpus is set or
* when this cpuset becomes a local partition root.
*/ cpumask_var_t effective_xcpus;
/*
- Exclusive CPUs as requested by the user (default hierarchy only)
*
* Its value is independent of cpus_allowed and designates the set of
* CPUs that can be granted to the current cpuset or its children when
* it becomes a valid partition root. The effective set of exclusive
* CPUs granted (effective_xcpus) depends on whether those exclusive
* CPUs are passed down by its ancestors and not yet taken up by
* another sibling partition root along the way.
*
* If its value isn't set, it defaults to cpus_allowed.
*/ cpumask_var_t exclusive_cpus;
@@ -230,6 +239,17 @@ static struct list_head remote_children;
- 2 - partition root without load balancing (isolated)
- -1 - invalid partition root
- -2 - invalid isolated partition root
- There are 2 types of partitions - local or remote. Local partitions are
- those whose parents are partition root themselves. Setting of
- cpuset.cpus.exclusive are optional in setting up local partitions.
- Remote partitions are those whose parents are not partition roots. Passing
- down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
- nodes are mandatory in creating a remote partition.
- For simplicity, a local partition can be created under a local or remote
- partition but a remote partition cannot have any partition root in its
*/
- ancestor chain except the cgroup root.
#define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -709,6 +729,19 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); }
+/* Return user specified exclusive CPUs */ +static inline struct cpumask *user_xcpus(struct cpuset *cs) +{
- return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
: cs->exclusive_cpus;
+}
+static inline bool xcpus_empty(struct cpuset *cs) +{
- return cpumask_empty(cs->cpus_allowed) &&
cpumask_empty(cs->exclusive_cpus);
+}
static inline struct cpumask *fetch_xcpus(struct cpuset *cs) { return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : @@ -1593,7 +1626,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
- Return: true if xcpus is not empty, false otherwise.
- Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
- it must be a subset of cpus_allowed and parent's effective_xcpus.
*/
- it must be a subset of parent's effective_xcpus.
static bool compute_effective_exclusive_cpumask(struct cpuset *cs, struct cpumask *xcpus) @@ -1603,12 +1636,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, if (!xcpus) xcpus = cs->effective_xcpus;
- if (!cpumask_empty(cs->exclusive_cpus))
cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
- else
cpumask_copy(xcpus, cs->cpus_allowed);
- return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
- return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
}
static inline bool is_remote_partition(struct cpuset *cs) @@ -1887,8 +1915,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state;
- xcpus = !cpumask_empty(cs->exclusive_cpus)
? cs->effective_xcpus : cs->cpus_allowed;
xcpus = user_xcpus(cs);
if (cmd == partcmd_invalidate) { if (is_prs_invalid(old_prs))
@@ -1916,7 +1943,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, return is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART; }
- if (!newmask && cpumask_empty(cs->cpus_allowed))
if (!newmask && xcpus_empty(cs)) return PERR_CPUSEMPTY;
nocpu = tasks_nocpu_error(parent, cs, xcpus);
@@ -3130,9 +3157,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) ? partcmd_enable : partcmd_enablei;
/*
* cpus_allowed cannot be empty.
*/* cpus_allowed and exclusive_cpus cannot be both empty.
if (cpumask_empty(cs->cpus_allowed)) {
}if (xcpus_empty(cs)) { err = PERR_CPUSEMPTY; goto out;
-- 2.39.3
On 7/15/24 11:56, Petr Malat wrote:
Hi, I finally got some time to test this and it works exactly as we needed it to. Thanks a lot, Petr
Thanks for the verification.
Cheers, Longman
On Mon, Jun 17, 2024 at 10:39:44AM -0400, Waiman Long wrote:
The "cpuset.cpus.exclusive.effective" value is currently limited to a subset of its "cpuset.cpus". This makes the exclusive CPUs distribution hierarchy subsumed within the larger "cpuset.cpus" hierarchy. We have to decide on what CPUs are used locally and what CPUs can be passed down as exclusive CPUs down the hierarchy and combine them into "cpuset.cpus".
The advantage of the current scheme is to have only one hierarchy to worry about. However, it make it harder to use as all the "cpuset.cpus" values have to be properly set along the way down to the designated remote partition root. It also makes it more cumbersome to find out what CPUs can be used locally.
Make creation of remote partition simpler by breaking the dependency of "cpuset.cpus.exclusive" on "cpuset.cpus" and make them independent entities. Now we have two separate hierarchies - one for setting "cpuset.cpus.effective" and the other one for setting "cpuset.cpus.exclusive.effective". We may not need to set "cpuset.cpus" when we activate a partition root anymore.
Also update Documentation/admin-guide/cgroup-v2.rst and cpuset.c comment to document this change.
Suggested-by: Petr Malat oss@malat.biz Signed-off-by: Waiman Long longman@redhat.com
Documentation/admin-guide/cgroup-v2.rst | 4 +- kernel/cgroup/cpuset.c | 67 +++++++++++++++++-------- 2 files changed, 49 insertions(+), 22 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 722e4762c4e0..2e4e74bea6ef 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2380,8 +2380,8 @@ Cpuset Interface Files cpuset-enabled cgroups.
This file shows the effective set of exclusive CPUs that
- can be used to create a partition root. The content of this
- file will always be a subset of "cpuset.cpus" and its parent's
- can be used to create a partition root. The content
- of this file will always be a subset of its parent's "cpuset.cpus.exclusive.effective" if its parent is not the root cgroup. It will also be a subset of "cpuset.cpus.exclusive" if it is set. If "cpuset.cpus.exclusive" is not set, it is
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 144bfc319809..fe76045aa528 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -87,7 +87,7 @@ static const char * const perr_strings[] = { [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug",
- [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
- [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", };
@@ -127,19 +127,28 @@ struct cpuset { /* * Exclusive CPUs dedicated to current cgroup (default hierarchy only) *
* This exclusive CPUs must be a subset of cpus_allowed. A parent
* cgroup can only grant exclusive CPUs to one of its children.
* The effective_cpus of a valid partition root comes solely from its
* effective_xcpus and some of the effective_xcpus may be distributed
* to sub-partitions below & hence excluded from its effective_cpus.
* For a valid partition root, its effective_cpus have no relationship
* with cpus_allowed unless its exclusive_cpus isn't set.
* When the cgroup becomes a valid partition root, effective_xcpus
* defaults to cpus_allowed if not set. The effective_cpus of a valid
* partition root comes solely from its effective_xcpus and some of the
* effective_xcpus may be distributed to sub-partitions below & hence
* excluded from its effective_cpus.
* This value will only be set if either exclusive_cpus is set or
* when this cpuset becomes a local partition root.
*/ cpumask_var_t effective_xcpus;
/*
- Exclusive CPUs as requested by the user (default hierarchy only)
*
* Its value is independent of cpus_allowed and designates the set of
* CPUs that can be granted to the current cpuset or its children when
* it becomes a valid partition root. The effective set of exclusive
* CPUs granted (effective_xcpus) depends on whether those exclusive
* CPUs are passed down by its ancestors and not yet taken up by
* another sibling partition root along the way.
*
* If its value isn't set, it defaults to cpus_allowed.
*/ cpumask_var_t exclusive_cpus;
@@ -230,6 +239,17 @@ static struct list_head remote_children;
- 2 - partition root without load balancing (isolated)
- -1 - invalid partition root
- -2 - invalid isolated partition root
- There are 2 types of partitions - local or remote. Local partitions are
- those whose parents are partition root themselves. Setting of
- cpuset.cpus.exclusive are optional in setting up local partitions.
- Remote partitions are those whose parents are not partition roots. Passing
- down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
- nodes are mandatory in creating a remote partition.
- For simplicity, a local partition can be created under a local or remote
- partition but a remote partition cannot have any partition root in its
*/ #define PRS_MEMBER 0 #define PRS_ROOT 1
- ancestor chain except the cgroup root.
@@ -709,6 +729,19 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); }
+/* Return user specified exclusive CPUs */ +static inline struct cpumask *user_xcpus(struct cpuset *cs) +{
- return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
: cs->exclusive_cpus;
+}
+static inline bool xcpus_empty(struct cpuset *cs) +{
- return cpumask_empty(cs->cpus_allowed) &&
cpumask_empty(cs->exclusive_cpus);
+}
- static inline struct cpumask *fetch_xcpus(struct cpuset *cs) { return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
@@ -1593,7 +1626,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
- Return: true if xcpus is not empty, false otherwise.
- Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
- it must be a subset of cpus_allowed and parent's effective_xcpus.
*/ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, struct cpumask *xcpus)
- it must be a subset of parent's effective_xcpus.
@@ -1603,12 +1636,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, if (!xcpus) xcpus = cs->effective_xcpus;
- if (!cpumask_empty(cs->exclusive_cpus))
cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
- else
cpumask_copy(xcpus, cs->cpus_allowed);
- return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); }
static inline bool is_remote_partition(struct cpuset *cs)
@@ -1887,8 +1915,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state;
- xcpus = !cpumask_empty(cs->exclusive_cpus)
? cs->effective_xcpus : cs->cpus_allowed;
xcpus = user_xcpus(cs);
if (cmd == partcmd_invalidate) { if (is_prs_invalid(old_prs))
@@ -1916,7 +1943,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, return is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART; }
- if (!newmask && cpumask_empty(cs->cpus_allowed))
if (!newmask && xcpus_empty(cs)) return PERR_CPUSEMPTY;
nocpu = tasks_nocpu_error(parent, cs, xcpus);
@@ -3130,9 +3157,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) ? partcmd_enable : partcmd_enablei;
/*
* cpus_allowed cannot be empty.
*/* cpus_allowed and exclusive_cpus cannot be both empty.
if (cpumask_empty(cs->cpus_allowed)) {
}if (xcpus_empty(cs)) { err = PERR_CPUSEMPTY; goto out;
-- 2.39.3
linux-kselftest-mirror@lists.linaro.org