Skip to content

Commit 1d4a6e1

Browse files
amotinberen12
authored andcommitted
Improve too large physical ashift handling
When iterating through children physical ashifts for vdev, prefer ones above the maximum logical ashift, that we can actually use, but within the administrator defined maximum. When selecting top-level vdev ashift, do not set it to the defined maximum in case physical ashift is even higher, but just ignore one. Using the maximum does not prevent misaligned writes, but reduces space efficiency. Since ZFS tries to write data sequentially and aggregates the writes, in many cases large misanigned writes may be not as bad as the space penalty otherwise. Allow internal physical ashifts for vdevs higher than SHIFT_MAX. May be one day allocator or aggregation could benefit from that. Reduce zfs_vdev_max_auto_ashift default from 16 (64KB) to 14 (16KB), so that ZFS may still use bigger ashifts up to SHIFT_MAX (64KB), but only if it really has to or explicitly told to, but not as an "optimization". There are some read-intensive NVMe SSDs that report Preferred Write Alignment of 64KB, and attempt to build RAIDZ2 of those leads to a space inefficiency that can't be justified. Instead these changes make ZFS fall back to logical ashift of 12 (4KB) by default and only warn user that it may be suboptimal for performance. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Ryan Moeller <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc. Closes openzfs#13798
1 parent 3c59f94 commit 1d4a6e1

File tree

9 files changed

+69
-13
lines changed

9 files changed

+69
-13
lines changed

include/sys/vdev_impl.h

+1
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
641641
*/
642642
int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
643643
void vdev_metaslab_group_create(vdev_t *vd);
644+
uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
644645

645646
/*
646647
* Vdev ashift optimization tunables

man/man4/zfs.4

+4-1
Original file line numberDiff line numberDiff line change
@@ -347,9 +347,12 @@ When a vdev is added, target this number of metaslabs per top-level vdev.
347347
.It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512 MiB Pc Pq int
348348
Default limit for metaslab size.
349349
.
350-
.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy ASHIFT_MAX Po 16 Pc Pq ulong
350+
.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq ulong
351351
Maximum ashift used when optimizing for logical \[->] physical sector size on new
352352
top-level vdevs.
353+
May be increased up to
354+
.Sy ASHIFT_MAX Po 16 Pc ,
355+
but this may negatively impact pool space efficiency.
353356
.
354357
.It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq ulong
355358
Minimum ashift used when creating new top-level vdevs.

module/os/freebsd/zfs/vdev_geom.c

+1-2
Original file line numberDiff line numberDiff line change
@@ -955,8 +955,7 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
955955
*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
956956
*physical_ashift = 0;
957957
if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
958-
ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) &&
959-
pp->stripeoffset == 0)
958+
ISP2(pp->stripesize) && pp->stripeoffset == 0)
960959
*physical_ashift = highbit(pp->stripesize) - 1;
961960

962961
/*

module/zfs/vdev.c

+33-3
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,15 @@ int zfs_vdev_standard_sm_blksz = (1 << 17);
136136
*/
137137
int zfs_nocacheflush = 0;
138138

139-
uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX;
139+
/*
140+
* Maximum and minimum ashift values that can be automatically set based on
141+
* vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX
142+
* is higher than the maximum value, it is intentionally limited here to not
143+
* excessively impact pool space efficiency. Higher ashift values may still
144+
* be forced by vdev logical ashift or by user via ashift property, but won't
145+
* be set automatically as a performance optimization.
146+
*/
147+
uint64_t zfs_vdev_max_auto_ashift = 14;
140148
uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
141149

142150
void
@@ -1845,6 +1853,24 @@ vdev_set_deflate_ratio(vdev_t *vd)
18451853
}
18461854
}
18471855

1856+
/*
1857+
* Choose the best of two ashifts, preferring one between logical ashift
1858+
* (absolute minimum) and administrator defined maximum, otherwise take
1859+
* the biggest of the two.
1860+
*/
1861+
uint64_t
1862+
vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
1863+
{
1864+
if (a > logical && a <= zfs_vdev_max_auto_ashift) {
1865+
if (b <= logical || b > zfs_vdev_max_auto_ashift)
1866+
return (a);
1867+
else
1868+
return (MAX(a, b));
1869+
} else if (b <= logical || b > zfs_vdev_max_auto_ashift)
1870+
return (MAX(a, b));
1871+
return (b);
1872+
}
1873+
18481874
/*
18491875
* Maximize performance by inflating the configured ashift for top level
18501876
* vdevs to be as close to the physical ashift as possible while maintaining
@@ -1856,7 +1882,8 @@ vdev_ashift_optimize(vdev_t *vd)
18561882
{
18571883
ASSERT(vd == vd->vdev_top);
18581884

1859-
if (vd->vdev_ashift < vd->vdev_physical_ashift) {
1885+
if (vd->vdev_ashift < vd->vdev_physical_ashift &&
1886+
vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) {
18601887
vd->vdev_ashift = MIN(
18611888
MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
18621889
MAX(zfs_vdev_min_auto_ashift,
@@ -4463,7 +4490,10 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
44634490
vs->vs_configured_ashift = vd->vdev_top != NULL
44644491
? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
44654492
vs->vs_logical_ashift = vd->vdev_logical_ashift;
4466-
vs->vs_physical_ashift = vd->vdev_physical_ashift;
4493+
if (vd->vdev_physical_ashift <= ASHIFT_MAX)
4494+
vs->vs_physical_ashift = vd->vdev_physical_ashift;
4495+
else
4496+
vs->vs_physical_ashift = 0;
44674497

44684498
/*
44694499
* Report fragmentation and rebuild progress for top-level,

module/zfs/vdev_draid.c

+8-2
Original file line numberDiff line numberDiff line change
@@ -1496,8 +1496,14 @@ vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep,
14961496
asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1;
14971497
max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1;
14981498
logical_ashift = MAX(logical_ashift, cvd->vdev_ashift);
1499-
physical_ashift = MAX(physical_ashift,
1500-
cvd->vdev_physical_ashift);
1499+
}
1500+
for (int c = 0; c < vd->vdev_children; c++) {
1501+
vdev_t *cvd = vd->vdev_child[c];
1502+
1503+
if (cvd->vdev_ops == &vdev_draid_spare_ops)
1504+
continue;
1505+
physical_ashift = vdev_best_ashift(logical_ashift,
1506+
physical_ashift, cvd->vdev_physical_ashift);
15011507
}
15021508

15031509
*asizep = asize;

module/zfs/vdev_mirror.c

+8-2
Original file line numberDiff line numberDiff line change
@@ -409,8 +409,14 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
409409
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
410410
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
411411
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
412-
*physical_ashift = MAX(*physical_ashift,
413-
cvd->vdev_physical_ashift);
412+
}
413+
for (int c = 0; c < vd->vdev_children; c++) {
414+
vdev_t *cvd = vd->vdev_child[c];
415+
416+
if (cvd->vdev_open_error)
417+
continue;
418+
*physical_ashift = vdev_best_ashift(*logical_ashift,
419+
*physical_ashift, cvd->vdev_physical_ashift);
414420
}
415421

416422
if (numerrors == vd->vdev_children) {

module/zfs/vdev_raidz.c

+8-2
Original file line numberDiff line numberDiff line change
@@ -1527,8 +1527,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
15271527
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
15281528
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
15291529
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1530-
*physical_ashift = MAX(*physical_ashift,
1531-
cvd->vdev_physical_ashift);
1530+
}
1531+
for (c = 0; c < vd->vdev_children; c++) {
1532+
vdev_t *cvd = vd->vdev_child[c];
1533+
1534+
if (cvd->vdev_open_error != 0)
1535+
continue;
1536+
*physical_ashift = vdev_best_ashift(*logical_ashift,
1537+
*physical_ashift, cvd->vdev_physical_ashift);
15321538
}
15331539

15341540
*asize *= vd->vdev_children;

tests/zfs-tests/include/tunables.cfg

+2
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,9 @@ TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch
8181
TXG_HISTORY txg.history zfs_txg_history
8282
TXG_TIMEOUT txg.timeout zfs_txg_timeout
8383
UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress
84+
VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift
8485
VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift
86+
VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift
8587
VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count
8688
VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip
8789
VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev

tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh

+4-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,9 @@ disk2=$TEST_BASE_DIR/disk2
5757
log_must mkfile $SIZE $disk1
5858
log_must mkfile $SIZE $disk2
5959

60+
logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT)
6061
orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT)
62+
max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT)
6163

6264
typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
6365
for ashift in ${ashifts[@]}
@@ -77,7 +79,8 @@ do
7779
log_must zpool create $TESTPOOL $disk1
7880
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $ashift
7981
log_must zpool add $TESTPOOL $disk2
80-
log_must verify_ashift $disk2 $ashift
82+
exp=$(( (ashift <= max_auto_ashift) ? ashift : logical_ashift ))
83+
log_must verify_ashift $disk2 $exp
8184

8285
# clean things for the next run
8386
log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $orig_ashift

0 commit comments

Comments
 (0)