Skip to content

Commit e7f889e

Browse files
amotinandrewc12
authored andcommitted
Avoid memory copies during mirror scrub
Issuing several scrub reads for a block we may use the parent ZIO buffer for one of child ZIOs. If that read complete successfully, then we won't need to copy the data explicitly. If block has only one copy (typical for root vdev, which is also a mirror inside), then we never need to copy -- succeed or fail as-is. Previous code also copied data from buffer of every successfully completed child ZIO, but that just does not make any sense. On healthy N-wide mirror this saves all N+1 (or even more in case of ditto blocks) memory copies for each scrubbed block, allowing CPU to focus mostly on check-summing. For other vdev types it should save one memory copy per block copy at root vdev. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Mark Maybee <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored-By: iXsystems, Inc. Closes openzfs#13606
1 parent e57d566 commit e7f889e

File tree

1 file changed

+61
-70
lines changed

1 file changed

+61
-70
lines changed

module/zfs/vdev_mirror.c

+61-70
Original file line numberDiff line numberDiff line change
@@ -436,36 +436,6 @@ vdev_mirror_child_done(zio_t *zio)
436436
{
437437
mirror_child_t *mc = zio->io_private;
438438

439-
/* See scrubbing read comment in vdev_mirror_io_start() */
440-
if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL)
441-
mc->mc_abd = zio->io_abd;
442-
443-
mc->mc_error = zio->io_error;
444-
mc->mc_tried = 1;
445-
mc->mc_skipped = 0;
446-
}
447-
448-
static void
449-
vdev_mirror_scrub_done(zio_t *zio)
450-
{
451-
mirror_child_t *mc = zio->io_private;
452-
453-
if (zio->io_error == 0) {
454-
zio_t *pio;
455-
zio_link_t *zl = NULL;
456-
457-
mutex_enter(&zio->io_lock);
458-
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
459-
mutex_enter(&pio->io_lock);
460-
ASSERT3U(zio->io_size, >=, pio->io_size);
461-
abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
462-
mutex_exit(&pio->io_lock);
463-
}
464-
mutex_exit(&zio->io_lock);
465-
}
466-
467-
abd_free(zio->io_abd);
468-
469439
mc->mc_error = zio->io_error;
470440
mc->mc_tried = 1;
471441
mc->mc_skipped = 0;
@@ -645,15 +615,13 @@ vdev_mirror_io_start(zio_t *zio)
645615
if (zio->io_type == ZIO_TYPE_READ) {
646616
if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
647617
/*
648-
* For scrubbing reads we need to allocate a buffer
649-
* for each child and issue reads to all children.
650-
* If we can verify the checksum here, as indicated
651-
* by io_bp being non-NULL, the data will be copied
652-
* into zio->io_data in vdev_mirror_scrub_done().
653-
* If not, then vdev_mirror_io_done() will compare
654-
* all of the read buffers and return a checksum
655-
* error if they aren't all identical.
618+
* For scrubbing reads we need to issue reads to all
619+
* children. One child can reuse parent buffer, but
620+
* for others we have to allocate separate ones to
621+
* verify checksums if io_bp is non-NULL, or compare
622+
* them in vdev_mirror_io_done() otherwise.
656623
*/
624+
boolean_t first = B_TRUE;
657625
for (c = 0; c < mm->mm_children; c++) {
658626
mc = &mm->mm_child[c];
659627

@@ -665,13 +633,15 @@ vdev_mirror_io_start(zio_t *zio)
665633
continue;
666634
}
667635

668-
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
669-
mc->mc_vd, mc->mc_offset,
636+
mc->mc_abd = first ? zio->io_abd :
670637
abd_alloc_sametype(zio->io_abd,
671-
zio->io_size), zio->io_size,
672-
zio->io_type, zio->io_priority, 0,
673-
zio->io_bp ? vdev_mirror_scrub_done :
638+
zio->io_size);
639+
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
640+
mc->mc_vd, mc->mc_offset, mc->mc_abd,
641+
zio->io_size, zio->io_type,
642+
zio->io_priority, 0,
674643
vdev_mirror_child_done, mc));
644+
first = B_FALSE;
675645
}
676646
zio_execute(zio);
677647
return;
@@ -802,55 +772,76 @@ vdev_mirror_io_done(zio_t *zio)
802772
return;
803773
}
804774

805-
/*
806-
* If we're scrubbing but don't have a BP available (because this
807-
* vdev is under a raidz or draid vdev) then the best we can do is
808-
* compare all of the copies read. If they're not identical then
809-
* return a checksum error and the most likely correct data. The
810-
* raidz code will issue a repair I/O if possible.
811-
*/
812-
if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL) {
813-
abd_t *last_good_abd;
814-
815-
ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
816-
zio->io_vd->vdev_ops == &vdev_spare_ops);
775+
if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) {
776+
abd_t *best_abd = NULL;
777+
if (last_good_copy >= 0)
778+
best_abd = mm->mm_child[last_good_copy].mc_abd;
817779

818-
if (good_copies > 1) {
819-
last_good_abd = mm->mm_child[last_good_copy].mc_abd;
820-
abd_t *best_abd = NULL;
780+
/*
781+
* If we're scrubbing but don't have a BP available (because
782+
* this vdev is under a raidz or draid vdev) then the best we
783+
* can do is compare all of the copies read. If they're not
784+
* identical then return a checksum error and the most likely
785+
* correct data. The raidz code will issue a repair I/O if
786+
* possible.
787+
*/
788+
if (zio->io_bp == NULL) {
789+
ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
790+
zio->io_vd->vdev_ops == &vdev_spare_ops);
821791

792+
abd_t *pref_abd = NULL;
822793
for (c = 0; c < last_good_copy; c++) {
823794
mc = &mm->mm_child[c];
824-
825795
if (mc->mc_error || !mc->mc_tried)
826796
continue;
827797

828-
if (abd_cmp(mc->mc_abd, last_good_abd) != 0)
798+
if (abd_cmp(mc->mc_abd, best_abd) != 0)
829799
zio->io_error = SET_ERROR(ECKSUM);
830800

831801
/*
832802
* The distributed spare is always prefered
833803
* by vdev_mirror_child_select() so it's
834804
* considered to be the best candidate.
835805
*/
836-
if (best_abd == NULL &&
806+
if (pref_abd == NULL &&
837807
mc->mc_vd->vdev_ops ==
838-
&vdev_draid_spare_ops) {
808+
&vdev_draid_spare_ops)
809+
pref_abd = mc->mc_abd;
810+
811+
/*
812+
* In the absence of a preferred copy, use
813+
* the parent pointer to avoid a memory copy.
814+
*/
815+
if (mc->mc_abd == zio->io_abd)
839816
best_abd = mc->mc_abd;
840-
}
841817
}
818+
if (pref_abd)
819+
best_abd = pref_abd;
820+
} else {
842821

843-
abd_copy(zio->io_abd, best_abd ? best_abd :
844-
last_good_abd, zio->io_size);
845-
846-
} else if (good_copies == 1) {
847-
last_good_abd = mm->mm_child[last_good_copy].mc_abd;
848-
abd_copy(zio->io_abd, last_good_abd, zio->io_size);
822+
/*
823+
* If we have a BP available, then checksums are
824+
* already verified and we just need a buffer
825+
* with valid data, preferring parent one to
826+
* avoid a memory copy.
827+
*/
828+
for (c = 0; c < last_good_copy; c++) {
829+
mc = &mm->mm_child[c];
830+
if (mc->mc_error || !mc->mc_tried)
831+
continue;
832+
if (mc->mc_abd == zio->io_abd) {
833+
best_abd = mc->mc_abd;
834+
break;
835+
}
836+
}
849837
}
850838

839+
if (best_abd && best_abd != zio->io_abd)
840+
abd_copy(zio->io_abd, best_abd, zio->io_size);
851841
for (c = 0; c < mm->mm_children; c++) {
852842
mc = &mm->mm_child[c];
853-
abd_free(mc->mc_abd);
843+
if (mc->mc_abd != zio->io_abd)
844+
abd_free(mc->mc_abd);
854845
mc->mc_abd = NULL;
855846
}
856847
}

0 commit comments

Comments
 (0)