Skip to content

Commit 5266a07

Browse files
authored
ZED: Do not offline a missing device if no spare is available
Due to commit d48091d a removed device is now explicitly offlined by the ZED if no spare is available, rather than letting ZFS detect it as UNAVAIL. This broke auto-replacing of whole-disk devices, as described in issue #10577. In short, when a new device is reinserted in the same slot, the ZED will try to ONLINE it without letting ZFS recreate the necessary partition table. This change simply avoids setting the device OFFLINE when removed if no spare is available (or if spare_on_remove is false). This change has been left minimal to allow it to be backported to the 0.8.x release. The auto_offline_001_pos ZTS test has been updated accordingly. Some follow-up work is planned to update the ZED so it transitions the vdev to a REMOVED state. This is a state which has always existed but there is no current interface the ZED can use to accomplish this. Therefore it's being left to a follow-up PR. Reviewed-by: Gionatan Danti <[email protected]> Co-authored-by: Gionatan Danti <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #10577 Closes #10730
1 parent cfd59f9 commit 5266a07

File tree

2 files changed

+39
-24
lines changed

2 files changed

+39
-24
lines changed

cmd/zed/agents/zfs_retire.c

+2-3
Original file line numberDiff line numberDiff line change
@@ -351,9 +351,8 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
351351
zpool_vdev_offline(zhp, devname, B_TRUE);
352352
} else if (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
353353
replace_with_spare(hdl, zhp, vdev) == B_FALSE) {
354-
/* Could not handle with spare: offline the device */
355-
fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname);
356-
zpool_vdev_offline(zhp, devname, B_TRUE);
354+
/* Could not handle with spare */
355+
fmd_hdl_debug(hdl, "no spare for '%s'", devname);
357356
}
358357

359358
free(devname);

tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh

+37-21
Original file line numberDiff line numberDiff line change
@@ -25,23 +25,29 @@
2525
#
2626
# DESCRIPTION:
2727
# Testing Fault Management Agent ZED Logic - Physically removed device is
28-
# offlined and onlined when reattached
28+
# made unavail and onlined when reattached
2929
#
3030
# STRATEGY:
3131
# 1. Create a pool
3232
# 2. Simulate physical removal of one device
33-
# 3. Verify the device is offlined
33+
# 3. Verify the device is unavailable
3434
# 4. Reattach the device
3535
# 5. Verify the device is onlined
36-
# 6. Repeat the same tests with a spare device: zed will use the spare to handle
37-
# the removed data device
38-
# 7. Repeat the same tests again with a faulted spare device: zed should offline
39-
# the removed data device if no spare is available
36+
# 6. Repeat the same tests with a spare device:
37+
# zed will use the spare to handle the removed data device
38+
# 7. Repeat the same tests again with a faulted spare device:
39+
# the removed data device should be unavailable
4040
#
4141
# NOTE: the use of 'block_device_wait' throughout the test helps avoid race
4242
# conditions caused by mixing creation/removal events from partitioning the
4343
# disk (zpool create) and events from physically removing it (remove_disk).
4444
#
45+
# NOTE: the test relies on 'zpool sync' to prompt the kmods to transition a
46+
# vdev to the unavailable state. The ZED does receive a removal notification
47+
# but only relies on it to activate a hot spare. Additional work is planned
48+
# to extend an existing ioctl interface to allow the ZED to transition the
49+
# vdev into a removed state.
50+
#
4551
verify_runnable "both"
4652

4753
if is_linux; then
@@ -76,7 +82,6 @@ removedev=$(get_debug_device)
7682
typeset poolconfs=(
7783
"mirror $filedev1 $removedev"
7884
"raidz3 $filedev1 $filedev2 $filedev3 $removedev"
79-
"$filedev1 cache $removedev"
8085
"mirror $filedev1 $filedev2 special mirror $filedev3 $removedev"
8186
)
8287

@@ -91,11 +96,16 @@ do
9196
log_must zpool create -f $TESTPOOL $conf
9297
block_device_wait ${DEV_DSKDIR}/${removedev}
9398

99+
mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
100+
log_fail "get_prop mountpoint /$TESTPOOL"
101+
94102
# 2. Simulate physical removal of one device
95103
remove_disk $removedev
104+
log_must mkfile 1m $mntpnt/file
105+
log_must zpool sync $TESTPOOL
96106

97-
# 3. Verify the device is offlined
98-
log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
107+
# 3. Verify the device is unvailable.
108+
log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
99109

100110
# 4. Reattach the device
101111
insert_disk $removedev
@@ -118,21 +128,22 @@ do
118128
block_device_wait ${DEV_DSKDIR}/${removedev}
119129
log_must zpool add $TESTPOOL spare $sparedev
120130

121-
# 3. Simulate physical removal of one device
131+
mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
132+
log_fail "get_prop mountpoint /$TESTPOOL"
133+
134+
# 2. Simulate physical removal of one device
122135
remove_disk $removedev
136+
log_must mkfile 1m $mntpnt/file
137+
log_must zpool sync $TESTPOOL
123138

124-
# 4. Verify the device is handled by the spare unless is a l2arc disk
125-
# which can only be offlined
126-
if [[ $(echo "$conf" | grep -c 'cache') -eq 0 ]]; then
127-
log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE"
128-
else
129-
log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
130-
fi
139+
# 3. Verify the device is handled by the spare.
140+
log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE"
141+
log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
131142

132-
# 5. Reattach the device
143+
# 4. Reattach the device
133144
insert_disk $removedev
134145

135-
# 6. Verify the device is onlined
146+
# 5. Verify the device is onlined
136147
log_must wait_vdev_state $TESTPOOL $removedev "ONLINE"
137148

138149
# cleanup
@@ -150,15 +161,20 @@ do
150161
block_device_wait ${DEV_DSKDIR}/${removedev}
151162
log_must zpool add $TESTPOOL spare $sparedev
152163

164+
mntpnt=$(get_prop mountpoint /$TESTPOOL) ||
165+
log_fail "get_prop mountpoint /$TESTPOOL"
166+
153167
# 2. Fault the spare device making it unavailable
154168
log_must zpool offline -f $TESTPOOL $sparedev
155169
log_must wait_hotspare_state $TESTPOOL $sparedev "FAULTED"
156170

157171
# 3. Simulate physical removal of one device
158172
remove_disk $removedev
173+
log_must mkfile 1m $mntpnt/file
174+
log_must zpool sync $TESTPOOL
159175

160-
# 4. Verify the device is offlined
161-
log_must wait_vdev_state $TESTPOOL $removedev "OFFLINE"
176+
# 4. Verify the device is unavailable
177+
log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"
162178

163179
# 5. Reattach the device
164180
insert_disk $removedev

0 commit comments

Comments
 (0)