Skip to content

Commit b205256

Browse files
hnaz authored and torvalds committed
mm: memcontrol: continue cache reclaim from offlined groups
On cgroup deletion, outstanding page cache charges are moved to the parent group so that they're not lost and can be reclaimed during pressure on/inside said parent. But this reparenting is fairly tricky and its synchronous nature has led to several lock-ups in the past. Since c2931b7 ("cgroup: iterate cgroup_subsys_states directly") css iterators now also include offlined css, so memcg iterators can be changed to include offlined children during reclaim of a group, and leftover cache can just stay put. There is a slight change of behavior in that charges of deleted groups no longer show up as local charges in the parent. But they are still included in the parent's hierarchical statistics. Signed-off-by: Johannes Weiner <[email protected]> Acked-by: Vladimir Davydov <[email protected]> Acked-by: Michal Hocko <[email protected]> Cc: David Rientjes <[email protected]> Cc: Tejun Heo <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 64f2199 commit b205256

File tree

1 file changed

+1
-217
lines changed

1 file changed

+1
-217
lines changed

mm/memcontrol.c

+1-217
Original file line numberDiff line numberDiff line change
@@ -1132,7 +1132,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
11321132
if (css == &root->css)
11331133
break;
11341134

1135-
if (css_tryget_online(css)) {
1135+
if (css_tryget(css)) {
11361136
/*
11371137
* Make sure the memcg is initialized:
11381138
* mem_cgroup_css_online() orders the the
@@ -3316,79 +3316,6 @@ static int mem_cgroup_move_account(struct page *page,
33163316
return ret;
33173317
}
33183318

3319-
/**
3320-
* mem_cgroup_move_parent - moves page to the parent group
3321-
* @page: the page to move
3322-
* @pc: page_cgroup of the page
3323-
* @child: page's cgroup
3324-
*
3325-
* move charges to its parent or the root cgroup if the group has no
3326-
* parent (aka use_hierarchy==0).
3327-
* Although this might fail (get_page_unless_zero, isolate_lru_page or
3328-
* mem_cgroup_move_account fails) the failure is always temporary and
3329-
* it signals a race with a page removal/uncharge or migration. In the
3330-
* first case the page is on the way out and it will vanish from the LRU
3331-
* on the next attempt and the call should be retried later.
3332-
* Isolation from the LRU fails only if page has been isolated from
3333-
* the LRU since we looked at it and that usually means either global
3334-
* reclaim or migration going on. The page will either get back to the
3335-
* LRU or vanish.
3336-
* Finaly mem_cgroup_move_account fails only if the page got uncharged
3337-
* (!PageCgroupUsed) or moved to a different group. The page will
3338-
* disappear in the next attempt.
3339-
*/
3340-
static int mem_cgroup_move_parent(struct page *page,
3341-
struct page_cgroup *pc,
3342-
struct mem_cgroup *child)
3343-
{
3344-
struct mem_cgroup *parent;
3345-
unsigned int nr_pages;
3346-
unsigned long uninitialized_var(flags);
3347-
int ret;
3348-
3349-
VM_BUG_ON(mem_cgroup_is_root(child));
3350-
3351-
ret = -EBUSY;
3352-
if (!get_page_unless_zero(page))
3353-
goto out;
3354-
if (isolate_lru_page(page))
3355-
goto put;
3356-
3357-
nr_pages = hpage_nr_pages(page);
3358-
3359-
parent = parent_mem_cgroup(child);
3360-
/*
3361-
* If no parent, move charges to root cgroup.
3362-
*/
3363-
if (!parent)
3364-
parent = root_mem_cgroup;
3365-
3366-
if (nr_pages > 1) {
3367-
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3368-
flags = compound_lock_irqsave(page);
3369-
}
3370-
3371-
ret = mem_cgroup_move_account(page, nr_pages,
3372-
pc, child, parent);
3373-
if (!ret) {
3374-
if (!mem_cgroup_is_root(parent))
3375-
css_get_many(&parent->css, nr_pages);
3376-
/* Take charge off the local counters */
3377-
page_counter_cancel(&child->memory, nr_pages);
3378-
if (do_swap_account)
3379-
page_counter_cancel(&child->memsw, nr_pages);
3380-
css_put_many(&child->css, nr_pages);
3381-
}
3382-
3383-
if (nr_pages > 1)
3384-
compound_unlock_irqrestore(page, flags);
3385-
putback_lru_page(page);
3386-
put:
3387-
put_page(page);
3388-
out:
3389-
return ret;
3390-
}
3391-
33923319
#ifdef CONFIG_MEMCG_SWAP
33933320
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
33943321
bool charge)
@@ -3682,105 +3609,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
36823609
return nr_reclaimed;
36833610
}
36843611

3685-
/**
3686-
* mem_cgroup_force_empty_list - clears LRU of a group
3687-
* @memcg: group to clear
3688-
* @node: NUMA node
3689-
* @zid: zone id
3690-
* @lru: lru to to clear
3691-
*
3692-
* Traverse a specified page_cgroup list and try to drop them all. This doesn't
3693-
* reclaim the pages page themselves - pages are moved to the parent (or root)
3694-
* group.
3695-
*/
3696-
static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3697-
int node, int zid, enum lru_list lru)
3698-
{
3699-
struct lruvec *lruvec;
3700-
unsigned long flags;
3701-
struct list_head *list;
3702-
struct page *busy;
3703-
struct zone *zone;
3704-
3705-
zone = &NODE_DATA(node)->node_zones[zid];
3706-
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3707-
list = &lruvec->lists[lru];
3708-
3709-
busy = NULL;
3710-
do {
3711-
struct page_cgroup *pc;
3712-
struct page *page;
3713-
3714-
spin_lock_irqsave(&zone->lru_lock, flags);
3715-
if (list_empty(list)) {
3716-
spin_unlock_irqrestore(&zone->lru_lock, flags);
3717-
break;
3718-
}
3719-
page = list_entry(list->prev, struct page, lru);
3720-
if (busy == page) {
3721-
list_move(&page->lru, list);
3722-
busy = NULL;
3723-
spin_unlock_irqrestore(&zone->lru_lock, flags);
3724-
continue;
3725-
}
3726-
spin_unlock_irqrestore(&zone->lru_lock, flags);
3727-
3728-
pc = lookup_page_cgroup(page);
3729-
3730-
if (mem_cgroup_move_parent(page, pc, memcg)) {
3731-
/* found lock contention or "pc" is obsolete. */
3732-
busy = page;
3733-
} else
3734-
busy = NULL;
3735-
cond_resched();
3736-
} while (!list_empty(list));
3737-
}
3738-
3739-
/*
3740-
* make mem_cgroup's charge to be 0 if there is no task by moving
3741-
* all the charges and pages to the parent.
3742-
* This enables deleting this mem_cgroup.
3743-
*
3744-
* Caller is responsible for holding css reference on the memcg.
3745-
*/
3746-
static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3747-
{
3748-
int node, zid;
3749-
3750-
do {
3751-
/* This is for making all *used* pages to be on LRU. */
3752-
lru_add_drain_all();
3753-
drain_all_stock_sync(memcg);
3754-
mem_cgroup_start_move(memcg);
3755-
for_each_node_state(node, N_MEMORY) {
3756-
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3757-
enum lru_list lru;
3758-
for_each_lru(lru) {
3759-
mem_cgroup_force_empty_list(memcg,
3760-
node, zid, lru);
3761-
}
3762-
}
3763-
}
3764-
mem_cgroup_end_move(memcg);
3765-
memcg_oom_recover(memcg);
3766-
cond_resched();
3767-
3768-
/*
3769-
* Kernel memory may not necessarily be trackable to a specific
3770-
* process. So they are not migrated, and therefore we can't
3771-
* expect their value to drop to 0 here.
3772-
* Having res filled up with kmem only is enough.
3773-
*
3774-
* This is a safety check because mem_cgroup_force_empty_list
3775-
* could have raced with mem_cgroup_replace_page_cache callers
3776-
* so the lru seemed empty but the page could have been added
3777-
* right after the check. RES_USAGE should be safe as we always
3778-
* charge before adding to the LRU.
3779-
*/
3780-
} while (page_counter_read(&memcg->memory) -
3781-
page_counter_read(&memcg->kmem) > 0);
3782-
}
3783-
37843612
/*
37853613
* Test whether @memcg has children, dead or alive. Note that this
37863614
* function doesn't care whether @memcg has use_hierarchy enabled and
@@ -5323,7 +5151,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
53235151
{
53245152
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
53255153
struct mem_cgroup_event *event, *tmp;
5326-
struct cgroup_subsys_state *iter;
53275154

53285155
/*
53295156
* Unregister events and notify userspace.
@@ -5337,56 +5164,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
53375164
}
53385165
spin_unlock(&memcg->event_list_lock);
53395166

5340-
/*
5341-
* This requires that offlining is serialized. Right now that is
5342-
* guaranteed because css_killed_work_fn() holds the cgroup_mutex.
5343-
*/
5344-
css_for_each_descendant_post(iter, css)
5345-
mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
5346-
53475167
memcg_unregister_all_caches(memcg);
53485168
vmpressure_cleanup(&memcg->vmpressure);
53495169
}
53505170

53515171
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
53525172
{
53535173
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5354-
/*
5355-
* XXX: css_offline() would be where we should reparent all
5356-
* memory to prepare the cgroup for destruction. However,
5357-
* memcg does not do css_tryget_online() and page_counter charging
5358-
* under the same RCU lock region, which means that charging
5359-
* could race with offlining. Offlining only happens to
5360-
* cgroups with no tasks in them but charges can show up
5361-
* without any tasks from the swapin path when the target
5362-
* memcg is looked up from the swapout record and not from the
5363-
* current task as it usually is. A race like this can leak
5364-
* charges and put pages with stale cgroup pointers into
5365-
* circulation:
5366-
*
5367-
* #0 #1
5368-
* lookup_swap_cgroup_id()
5369-
* rcu_read_lock()
5370-
* mem_cgroup_lookup()
5371-
* css_tryget_online()
5372-
* rcu_read_unlock()
5373-
* disable css_tryget_online()
5374-
* call_rcu()
5375-
* offline_css()
5376-
* reparent_charges()
5377-
* page_counter_try_charge()
5378-
* css_put()
5379-
* css_free()
5380-
* pc->mem_cgroup = dead memcg
5381-
* add page to lru
5382-
*
5383-
* The bulk of the charges are still moved in offline_css() to
5384-
* avoid pinning a lot of pages in case a long-term reference
5385-
* like a swapout record is deferring the css_free() to long
5386-
* after offlining. But this makes sure we catch any charges
5387-
* made after offlining:
5388-
*/
5389-
mem_cgroup_reparent_charges(memcg);
53905174

53915175
memcg_destroy_kmem(memcg);
53925176
__mem_cgroup_free(memcg);

0 commit comments

Comments
 (0)