On Mon, 13 Jan 2020, Mike Kravetz wrote:
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 35415af9ed26f..b03270b0d5833 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -96,8 +96,12 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
 	int idx;
 
 	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
-		if (page_counter_read(&h_cg->hugepage[idx]))
+		if (page_counter_read(
+			    hugetlb_cgroup_get_counter(h_cg, idx, true)) ||
+		    page_counter_read(
+			    hugetlb_cgroup_get_counter(h_cg, idx, false))) {
 			return true;
+		}
 	}
 	return false;
 }
@@ -108,18 +112,32 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
 	int idx;
 
 	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
-		struct page_counter *counter = &h_cgroup->hugepage[idx];
-		struct page_counter *parent = NULL;
+		struct page_counter *fault_parent = NULL;
+		struct page_counter *reserved_parent = NULL;
 		unsigned long limit;
 		int ret;
 
-		if (parent_h_cgroup)
-			parent = &parent_h_cgroup->hugepage[idx];
-		page_counter_init(counter, parent);
+		if (parent_h_cgroup) {
+			fault_parent = hugetlb_cgroup_get_counter(
+				parent_h_cgroup, idx, false);
+			reserved_parent = hugetlb_cgroup_get_counter(
+				parent_h_cgroup, idx, true);
+		}
+		page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx,
+							     false),
+				  fault_parent);
+		page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx,
+							     true),
+				  reserved_parent);
 
 		limit = round_down(PAGE_COUNTER_MAX,
 				   1 << huge_page_order(&hstates[idx]));
-		ret = page_counter_set_max(counter, limit);
+		ret = page_counter_set_max(
+			hugetlb_cgroup_get_counter(h_cgroup, idx, false),
+			limit);
+		ret = page_counter_set_max(
+			hugetlb_cgroup_get_counter(h_cgroup, idx, true),
+			limit);
 		VM_BUG_ON(ret);
 	}
 }

The second page_counter_set_max() call overwrites ret before the check in VM_BUG_ON().
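
Something along these lines would keep both checks meaningful (untested sketch, reusing the hugetlb_cgroup_get_counter() helper this patch adds):

		ret = page_counter_set_max(
			hugetlb_cgroup_get_counter(h_cgroup, idx, false),
			limit);
		/* check each result before ret is overwritten */
		VM_BUG_ON(ret);
		ret = page_counter_set_max(
			hugetlb_cgroup_get_counter(h_cgroup, idx, true),
			limit);
		VM_BUG_ON(ret);
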
@@ -149,7 +167,6 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
 	kfree(h_cgroup);
 }
 
-
 /*
  * Should be called with hugetlb_lock held.
  * Since we are holding hugetlb_lock, pages cannot get moved from
@@ -165,7 +182,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
 	struct hugetlb_cgroup *page_hcg;
 	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
 
-	page_hcg = hugetlb_cgroup_from_page(page);
+	page_hcg = hugetlb_cgroup_from_page(page, false);
 	/*
 	 * We can have pages in active list without any cgroup
 	 * ie, hugepage with less than 3 pages. We can safely
@@ -184,7 +201,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
 		/* Take the pages off the local counter */
 		page_counter_cancel(counter, nr_pages);
 
-		set_hugetlb_cgroup(page, parent);
+		set_hugetlb_cgroup(page, parent, false);
 	}
 out:
 	return;
@@ -227,7 +244,7 @@ static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
 }
 
 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
-				 struct hugetlb_cgroup **ptr)
+				 struct hugetlb_cgroup **ptr, bool reserved)
 {
 	int ret = 0;
 	struct page_counter *counter;
@@ -250,13 +267,20 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 	}
 	rcu_read_unlock();
 
-	if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages,
-				     &counter)) {
+	if (!page_counter_try_charge(hugetlb_cgroup_get_counter(h_cg, idx,
+								reserved),
+				     nr_pages, &counter)) {
 		ret = -ENOMEM;
 		hugetlb_event(hugetlb_cgroup_from_counter(counter, idx), idx,
 			      HUGETLB_MAX);
+		css_put(&h_cg->css);
+		goto done;
 	}
-	css_put(&h_cg->css);
+	/* Reservations take a reference to the css because they do not get
+	 * reparented.

I'm hoping someone with more cgroup knowledge can comment on this and any consequences of not reparenting reservations. We previously talked about why reparenting would be very difficult/expensive; I understand why you are not doing it. I just do not fully understand what needs to be done from the cgroup side.
I don't see any description in the last patch in the series of how hugetlb_cgroup currently acts wrt reparenting, or of how this is the same or different for reservations. I think the discussion referenced here is probably lost in some previous posting of the series. It's particularly useful information that the end user will need to know about to handle this, so it would benefit from some documentation in the last patch.
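
For reference, the only reparenting hugetlb_cgroup does today happens at css offline time, where the per-hstate active lists are walked and fault charges are moved up to the parent. Paraphrasing the existing hugetlb_cgroup_css_offline() (a sketch from memory, not a verbatim quote of the tree this series is based on):

static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
	struct hstate *h;
	struct page *page;

	do {
		for_each_hstate(h) {
			spin_lock(&hugetlb_lock);
			/* move each fault charge on the active list to the parent */
			list_for_each_entry(page, &h->hugepage_activelist, lru)
				hugetlb_cgroup_move_parent(hstate_index(h),
							   h_cg, page);
			spin_unlock(&hugetlb_lock);
		}
		cond_resched();
	} while (hugetlb_cgroup_have_usage(h_cg));
}

There is no equivalent walk for the new reserved counters, which as I understand it is why reservations pin the css instead of being reparented. That asymmetry is the part I think needs to be spelled out for users.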