Skip to content

Commit 97b2782

Browse files
htejunaxboe
authored andcommitted
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former trackes ownership per-page while the latter per-inode. This was a deliberate design decision because honoring per-page ownership in the writeback path is complicated, may lead to higher CPU and IO overheads and deemed unnecessary given that write-sharing an inode across different cgroups isn't a common use-case. Combined with inode majority-writer ownership switching, this works well enough in most cases but there are some pathological cases. For example, let's say there are two cgroups A and B which keep writing to different but confined parts of the same inode. B owns the inode and A's memory is limited far below B's. A's dirty ratio can rise enough to trigger balance_dirty_pages() sleeps but B's can be low enough to avoid triggering background writeback. A will be slowed down without a way to make writeback of the dirty pages happen. This patch implements foreign dirty recording and foreign mechanism so that when a memcg encounters a condition as above it can trigger flushes on bdi_writebacks which can clean its pages. Please see the comment on top of mem_cgroup_track_foreign_dirty_slowpath() for details. A reproducer follows. write-range.c:: #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <fcntl.h> #include <sys/types.h> static const char *usage = "write-range FILE START SIZE\n"; int main(int argc, char **argv) { int fd; unsigned long start, size, end, pos; char *endp; char buf[4096]; if (argc < 4) { fprintf(stderr, usage); return 1; } fd = open(argv[1], O_WRONLY); if (fd < 0) { perror("open"); return 1; } start = strtoul(argv[2], &endp, 0); if (*endp != '\0') { fprintf(stderr, usage); return 1; } size = strtoul(argv[3], &endp, 0); if (*endp != '\0') { fprintf(stderr, usage); return 1; } end = start + size; while (1) { for (pos = start; pos < end; ) { long bread, bwritten = 0; if (lseek(fd, pos, SEEK_SET) < 0) { perror("lseek"); return 1; } bread = read(0, buf, sizeof(buf) < end - pos ? sizeof(buf) : end - pos); if (bread < 0) { perror("read"); return 1; } if (bread == 0) return 0; while (bwritten < bread) { long this; this = write(fd, buf + bwritten, bread - bwritten); if (this < 0) { perror("write"); return 1; } bwritten += this; pos += bwritten; } } } } repro.sh:: #!/bin/bash set -e set -x sysctl -w vm.dirty_expire_centisecs=300000 sysctl -w vm.dirty_writeback_centisecs=300000 sysctl -w vm.dirtytime_expire_seconds=300000 echo 3 > /proc/sys/vm/drop_caches TEST=/sys/fs/cgroup/test A=$TEST/A B=$TEST/B mkdir -p $A $B echo "+memory +io" > $TEST/cgroup.subtree_control echo $((1<<30)) > $A/memory.high echo $((32<<30)) > $B/memory.high rm -f testfile touch testfile fallocate -l 4G testfile echo "Starting B" (echo $BASHPID > $B/cgroup.procs pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) & echo "Waiting 10s to ensure B claims the testfile inode" sleep 5 sync sleep 5 sync echo "Starting A" (echo $BASHPID > $A/cgroup.procs pv < /dev/urandom | ./write-range testfile 0 $((2<<30))) v2: Added comments explaining why the specific intervals are being used. v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort flushing while avoding possible livelocks. v4: Use get_jiffies_64() and time_before/after64() instead of raw jiffies_64 and arthimetic comparisons as suggested by Jan. Reviewed-by: Jan Kara <[email protected]> Signed-off-by: Tejun Heo <[email protected]> Signed-off-by: Jens Axboe <[email protected]>
1 parent d62241c commit 97b2782

File tree

4 files changed

+178
-0
lines changed

4 files changed

+178
-0
lines changed

include/linux/backing-dev-defs.h

+1
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ enum wb_reason {
6363
* so it has a mismatch name.
6464
*/
6565
WB_REASON_FORKER_THREAD,
66+
WB_REASON_FOREIGN_FLUSH,
6667

6768
WB_REASON_MAX,
6869
};

include/linux/memcontrol.h

+39
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,23 @@ struct memcg_padding {
183183
#define MEMCG_PADDING(name)
184184
#endif
185185

186+
/*
187+
* Remember four most recent foreign writebacks with dirty pages in this
188+
* cgroup. Inode sharing is expected to be uncommon and, even if we miss
189+
* one in a given round, we're likely to catch it later if it keeps
190+
* foreign-dirtying, so a fairly low count should be enough.
191+
*
192+
* See mem_cgroup_track_foreign_dirty_slowpath() for details.
193+
*/
194+
#define MEMCG_CGWB_FRN_CNT 4
195+
196+
struct memcg_cgwb_frn {
197+
u64 bdi_id; /* bdi->id of the foreign inode */
198+
int memcg_id; /* memcg->css.id of foreign inode */
199+
u64 at; /* jiffies_64 at the time of dirtying */
200+
struct wb_completion done; /* tracks in-flight foreign writebacks */
201+
};
202+
186203
/*
187204
* The memory controller data structure. The memory controller controls both
188205
* page cache and RSS per cgroup. We would eventually like to provide
@@ -307,6 +324,7 @@ struct mem_cgroup {
307324
#ifdef CONFIG_CGROUP_WRITEBACK
308325
struct list_head cgwb_list;
309326
struct wb_domain cgwb_domain;
327+
struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
310328
#endif
311329

312330
/* List of events which userspace want to receive */
@@ -1218,6 +1236,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
12181236
unsigned long *pheadroom, unsigned long *pdirty,
12191237
unsigned long *pwriteback);
12201238

1239+
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
1240+
struct bdi_writeback *wb);
1241+
1242+
static inline void mem_cgroup_track_foreign_dirty(struct page *page,
1243+
struct bdi_writeback *wb)
1244+
{
1245+
if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
1246+
mem_cgroup_track_foreign_dirty_slowpath(page, wb);
1247+
}
1248+
1249+
void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
1250+
12211251
#else /* CONFIG_CGROUP_WRITEBACK */
12221252

12231253
static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
@@ -1233,6 +1263,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
12331263
{
12341264
}
12351265

1266+
static inline void mem_cgroup_track_foreign_dirty(struct page *page,
1267+
struct bdi_writeback *wb)
1268+
{
1269+
}
1270+
1271+
static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
1272+
{
1273+
}
1274+
12361275
#endif /* CONFIG_CGROUP_WRITEBACK */
12371276

12381277
struct sock;

mm/memcontrol.c

+134
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ int do_swap_account __read_mostly;
8787
#define do_swap_account 0
8888
#endif
8989

90+
#ifdef CONFIG_CGROUP_WRITEBACK
91+
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
92+
#endif
93+
9094
/* Whether legacy memory+swap accounting is active */
9195
static bool do_memsw_account(void)
9296
{
@@ -4145,6 +4149,127 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
41454149
}
41464150
}
41474151

4152+
/*
4153+
* Foreign dirty flushing
4154+
*
4155+
* There's an inherent mismatch between memcg and writeback. The former
4156+
* trackes ownership per-page while the latter per-inode. This was a
4157+
* deliberate design decision because honoring per-page ownership in the
4158+
* writeback path is complicated, may lead to higher CPU and IO overheads
4159+
* and deemed unnecessary given that write-sharing an inode across
4160+
* different cgroups isn't a common use-case.
4161+
*
4162+
* Combined with inode majority-writer ownership switching, this works well
4163+
* enough in most cases but there are some pathological cases. For
4164+
* example, let's say there are two cgroups A and B which keep writing to
4165+
* different but confined parts of the same inode. B owns the inode and
4166+
* A's memory is limited far below B's. A's dirty ratio can rise enough to
4167+
* trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4168+
* triggering background writeback. A will be slowed down without a way to
4169+
* make writeback of the dirty pages happen.
4170+
*
4171+
* Conditions like the above can lead to a cgroup getting repatedly and
4172+
* severely throttled after making some progress after each
4173+
* dirty_expire_interval while the underyling IO device is almost
4174+
* completely idle.
4175+
*
4176+
* Solving this problem completely requires matching the ownership tracking
4177+
* granularities between memcg and writeback in either direction. However,
4178+
* the more egregious behaviors can be avoided by simply remembering the
4179+
* most recent foreign dirtying events and initiating remote flushes on
4180+
* them when local writeback isn't enough to keep the memory clean enough.
4181+
*
4182+
* The following two functions implement such mechanism. When a foreign
4183+
* page - a page whose memcg and writeback ownerships don't match - is
4184+
* dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4185+
* bdi_writeback on the page owning memcg. When balance_dirty_pages()
4186+
* decides that the memcg needs to sleep due to high dirty ratio, it calls
4187+
* mem_cgroup_flush_foreign() which queues writeback on the recorded
4188+
* foreign bdi_writebacks which haven't expired. Both the numbers of
4189+
* recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4190+
* limited to MEMCG_CGWB_FRN_CNT.
4191+
*
4192+
* The mechanism only remembers IDs and doesn't hold any object references.
4193+
* As being wrong occasionally doesn't matter, updates and accesses to the
4194+
* records are lockless and racy.
4195+
*/
4196+
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4197+
struct bdi_writeback *wb)
4198+
{
4199+
struct mem_cgroup *memcg = page->mem_cgroup;
4200+
struct memcg_cgwb_frn *frn;
4201+
u64 now = get_jiffies_64();
4202+
u64 oldest_at = now;
4203+
int oldest = -1;
4204+
int i;
4205+
4206+
/*
4207+
* Pick the slot to use. If there is already a slot for @wb, keep
4208+
* using it. If not replace the oldest one which isn't being
4209+
* written out.
4210+
*/
4211+
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4212+
frn = &memcg->cgwb_frn[i];
4213+
if (frn->bdi_id == wb->bdi->id &&
4214+
frn->memcg_id == wb->memcg_css->id)
4215+
break;
4216+
if (time_before64(frn->at, oldest_at) &&
4217+
atomic_read(&frn->done.cnt) == 1) {
4218+
oldest = i;
4219+
oldest_at = frn->at;
4220+
}
4221+
}
4222+
4223+
if (i < MEMCG_CGWB_FRN_CNT) {
4224+
/*
4225+
* Re-using an existing one. Update timestamp lazily to
4226+
* avoid making the cacheline hot. We want them to be
4227+
* reasonably up-to-date and significantly shorter than
4228+
* dirty_expire_interval as that's what expires the record.
4229+
* Use the shorter of 1s and dirty_expire_interval / 8.
4230+
*/
4231+
unsigned long update_intv =
4232+
min_t(unsigned long, HZ,
4233+
msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4234+
4235+
if (time_before64(frn->at, now - update_intv))
4236+
frn->at = now;
4237+
} else if (oldest >= 0) {
4238+
/* replace the oldest free one */
4239+
frn = &memcg->cgwb_frn[oldest];
4240+
frn->bdi_id = wb->bdi->id;
4241+
frn->memcg_id = wb->memcg_css->id;
4242+
frn->at = now;
4243+
}
4244+
}
4245+
4246+
/* issue foreign writeback flushes for recorded foreign dirtying events */
4247+
void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4248+
{
4249+
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4250+
unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4251+
u64 now = jiffies_64;
4252+
int i;
4253+
4254+
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4255+
struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4256+
4257+
/*
4258+
* If the record is older than dirty_expire_interval,
4259+
* writeback on it has already started. No need to kick it
4260+
* off again. Also, don't start a new one if there's
4261+
* already one in flight.
4262+
*/
4263+
if (time_after64(frn->at, now - intv) &&
4264+
atomic_read(&frn->done.cnt) == 1) {
4265+
frn->at = 0;
4266+
cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4267+
WB_REASON_FOREIGN_FLUSH,
4268+
&frn->done);
4269+
}
4270+
}
4271+
}
4272+
41484273
#else /* CONFIG_CGROUP_WRITEBACK */
41494274

41504275
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4661,6 +4786,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
46614786
struct mem_cgroup *memcg;
46624787
unsigned int size;
46634788
int node;
4789+
int __maybe_unused i;
46644790

46654791
size = sizeof(struct mem_cgroup);
46664792
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4704,6 +4830,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
47044830
#endif
47054831
#ifdef CONFIG_CGROUP_WRITEBACK
47064832
INIT_LIST_HEAD(&memcg->cgwb_list);
4833+
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
4834+
memcg->cgwb_frn[i].done =
4835+
__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
47074836
#endif
47084837
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
47094838
return memcg;
@@ -4833,7 +4962,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
48334962
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
48344963
{
48354964
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4965+
int __maybe_unused i;
48364966

4967+
#ifdef CONFIG_CGROUP_WRITEBACK
4968+
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
4969+
wb_wait_for_completion(&memcg->cgwb_frn[i].done);
4970+
#endif
48374971
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
48384972
static_branch_dec(&memcg_sockets_enabled_key);
48394973

mm/page-writeback.c

+4
Original file line numberDiff line numberDiff line change
@@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
16671667
if (unlikely(!writeback_in_progress(wb)))
16681668
wb_start_background_writeback(wb);
16691669

1670+
mem_cgroup_flush_foreign(wb);
1671+
16701672
/*
16711673
* Calculate global domain's pos_ratio and select the
16721674
* global dtc by default.
@@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
24272429
task_io_account_write(PAGE_SIZE);
24282430
current->nr_dirtied++;
24292431
this_cpu_inc(bdp_ratelimits);
2432+
2433+
mem_cgroup_track_foreign_dirty(page, wb);
24302434
}
24312435
}
24322436

0 commit comments

Comments
 (0)