
Commit e076ab2

josefbacik authored and kdave committed
btrfs: shrink delalloc pages instead of full inodes
Commit 38d715f ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing some infrastructure we have in place to flush inodes that we use for device replace and snapshot. However, this introduced a pretty serious performance regression. To reproduce, the user untarred the source tarball of Firefox (360MiB xz compressed/1.5GiB uncompressed), and would see it take anywhere from 5 to 20 times as long to untar in 5.10 compared to 5.9. This was observed on fast devices (SSD and better) and not on HDD.

The root cause is that before we would generally use the normal writeback path to reclaim delalloc space, and for this we would provide it with the number of pages we wanted to flush. The referenced commit changed this to flush that many inodes, which drastically increased the amount of space we were flushing in certain cases, which severely affected performance.

We cannot revert this patch unfortunately because of 3d45f22 ("btrfs: fix deadlock when cloning inline extent and low on free metadata space"), which requires the ability to skip flushing inodes that are being cloned in certain scenarios. That means we need to keep using our flushing infrastructure or risk re-introducing the deadlock.

Instead, to fix this problem we can go back to providing btrfs_start_delalloc_roots with a number of pages to flush, and then set up a writeback_control and utilize sync_inode() to handle the flushing for us. This gives us the same behavior we had prior to the fix, while still allowing us to avoid the deadlock that was fixed by Filipe. I redid the user's original test and got the following results on one of our test machines (256GiB of ram, 56 cores, 2TiB Intel NVMe drive):

  5.9		0m54.258s
  5.10		1m26.212s
  5.10+patch	0m38.800s

5.10+patch is significantly faster than plain 5.9 because of my patch series "Change data reservations to use the ticketing infra", which contained the patch that introduced the regression but generally improved the overall ENOSPC flushing mechanisms.

Additional testing on a consumer-grade SSD (8GiB ram, 8 CPU) confirms the results:

  5.10.5		4m00s
  5.10.5+patch		1m08s
  5.11-rc2		5m14s
  5.11-rc2+patch	1m30s

Reported-by: René Rebe <[email protected]>
Fixes: 38d715f ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
CC: [email protected] # 5.10
Signed-off-by: Josef Bacik <[email protected]>
Tested-by: David Sterba <[email protected]>
Reviewed-by: David Sterba <[email protected]>
[ add my test results ]
Signed-off-by: David Sterba <[email protected]>
1 parent 50e31ef commit e076ab2
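The core of the fix is the move from "flush N inodes" to "flush N pages": the page budget lives in wbc->nr_to_write and is consumed as sync_inode() writes back pages, so the walk over delalloc inodes stops as soon as the budget is spent. The sketch below is a self-contained user-space model of that behaviour, not btrfs code; fake_wbc and fake_sync_inode are made-up stand-ins and the numbers are arbitrary. The actual change is in the diff that follows.

/*
 * User-space model of the bounded flush: one shared page budget
 * (analogous to wbc->nr_to_write) is drained across all delalloc
 * inodes, and the walk stops once the budget is spent instead of
 * flushing every inode in full.
 */
#include <stdio.h>

struct fake_wbc {
	long nr_to_write;	/* remaining page budget */
};

/* Stand-in for sync_inode(): writes at most the remaining budget. */
static void fake_sync_inode(long dirty_pages, struct fake_wbc *wbc)
{
	long written = dirty_pages < wbc->nr_to_write ? dirty_pages : wbc->nr_to_write;

	wbc->nr_to_write -= written;
	printf("wrote %ld of %ld dirty pages, budget left %ld\n",
	       written, dirty_pages, wbc->nr_to_write);
}

int main(void)
{
	long dirty[] = { 300, 5000, 120000 };		/* dirty pages per inode (made up) */
	struct fake_wbc wbc = { .nr_to_write = 4096 };	/* shrink_delalloc's page budget */
	int i;

	for (i = 0; i < 3 && wbc.nr_to_write > 0; i++)
		fake_sync_inode(dirty[i], &wbc);
	/* Before the fix, all three inodes would have been flushed in full. */
	return 0;
}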

File tree

2 files changed: +46 −18 lines

fs/btrfs/inode.c

+43 −17
@@ -9390,7 +9390,8 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot,
+static int start_delalloc_inodes(struct btrfs_root *root,
+				 struct writeback_control *wbc, bool snapshot,
 				 bool in_reclaim_context)
 {
 	struct btrfs_inode *binode;
@@ -9399,6 +9400,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
 	struct list_head works;
 	struct list_head splice;
 	int ret = 0;
+	bool full_flush = wbc->nr_to_write == LONG_MAX;
 
 	INIT_LIST_HEAD(&works);
 	INIT_LIST_HEAD(&splice);
@@ -9427,18 +9429,24 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
 		if (snapshot)
 			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
 				&binode->runtime_flags);
-		work = btrfs_alloc_delalloc_work(inode);
-		if (!work) {
-			iput(inode);
-			ret = -ENOMEM;
-			goto out;
-		}
-		list_add_tail(&work->list, &works);
-		btrfs_queue_work(root->fs_info->flush_workers,
-				 &work->work);
-		if (*nr != U64_MAX) {
-			(*nr)--;
-			if (*nr == 0)
+		if (full_flush) {
+			work = btrfs_alloc_delalloc_work(inode);
+			if (!work) {
+				iput(inode);
+				ret = -ENOMEM;
+				goto out;
+			}
+			list_add_tail(&work->list, &works);
+			btrfs_queue_work(root->fs_info->flush_workers,
+					 &work->work);
+		} else {
+			ret = sync_inode(inode, wbc);
+			if (!ret &&
+			    test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+				     &BTRFS_I(inode)->runtime_flags))
+				ret = sync_inode(inode, wbc);
+			btrfs_add_delayed_iput(inode);
+			if (ret || wbc->nr_to_write <= 0)
 				goto out;
 		}
 		cond_resched();
@@ -9464,18 +9472,29 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
 
 int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
 {
+	struct writeback_control wbc = {
+		.nr_to_write = LONG_MAX,
+		.sync_mode = WB_SYNC_NONE,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+	};
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	u64 nr = U64_MAX;
 
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		return -EROFS;
 
-	return start_delalloc_inodes(root, &nr, true, false);
+	return start_delalloc_inodes(root, &wbc, true, false);
 }
 
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
 			       bool in_reclaim_context)
 {
+	struct writeback_control wbc = {
+		.nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr,
+		.sync_mode = WB_SYNC_NONE,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+	};
 	struct btrfs_root *root;
 	struct list_head splice;
 	int ret;
@@ -9489,6 +9508,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
 	spin_lock(&fs_info->delalloc_root_lock);
 	list_splice_init(&fs_info->delalloc_roots, &splice);
 	while (!list_empty(&splice) && nr) {
+		/*
+		 * Reset nr_to_write here so we know that we're doing a full
+		 * flush.
+		 */
+		if (nr == U64_MAX)
+			wbc.nr_to_write = LONG_MAX;
+
 		root = list_first_entry(&splice, struct btrfs_root,
 					delalloc_root);
 		root = btrfs_grab_root(root);
@@ -9497,9 +9523,9 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
 				       &fs_info->delalloc_roots);
 		spin_unlock(&fs_info->delalloc_root_lock);
 
-		ret = start_delalloc_inodes(root, &nr, false, in_reclaim_context);
+		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
 		btrfs_put_root(root);
-		if (ret < 0)
+		if (ret < 0 || wbc.nr_to_write <= 0)
 			goto out;
 		spin_lock(&fs_info->delalloc_root_lock);
 	}

fs/btrfs/space-info.c

+3 −1
@@ -532,7 +532,9 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
 
 	loops = 0;
 	while ((delalloc_bytes || dio_bytes) && loops < 3) {
-		btrfs_start_delalloc_roots(fs_info, items, true);
+		u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+		btrfs_start_delalloc_roots(fs_info, nr_pages, true);
 
 		loops++;
 		if (wait_ordered && !trans) {
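
For a sense of scale, the nr_pages budget computed above is modest compared to the whole-inode flushes it replaces. The standalone example below works through the min()/PAGE_SHIFT arithmetic with assumed numbers (4 KiB pages, a 16 MiB reclaim target, 1.5 GiB of outstanding delalloc); the values are illustrative, not taken from the patch.

/* Standalone illustration of the byte-to-page budget, with made-up numbers. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KiB pages, the common x86-64 case */

int main(void)
{
	uint64_t delalloc_bytes = 1536ULL << 20;	/* 1.5 GiB of dirty delalloc */
	uint64_t to_reclaim = 16ULL << 20;		/* reclaim target of 16 MiB */
	uint64_t bytes = delalloc_bytes < to_reclaim ? delalloc_bytes : to_reclaim;
	uint64_t nr_pages = bytes >> PAGE_SHIFT;

	/* 16 MiB / 4 KiB = 4096 pages of writeback per loop iteration */
	printf("flush budget: %llu pages\n", (unsigned long long)nr_pages);
	return 0;
}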
