
Commit 21f7c97

Add support for autoexpand property
While the autoexpand property may seem like a small feature, it depends on a
significant amount of system infrastructure. Enough of that infrastructure is
now in place that, with a few modifications for Linux, it can be supported.

Auto-expand works as follows: when a block device is modified (re-sized,
closed after being open r/w, etc.) a change uevent is generated for udev.
The ZED, which is monitoring udev events, passes the change event along to
zfs_deliver_dle() if the disk or partition contains a zfs_member as
identified by blkid.

From here the device is matched against all imported pool vdevs using the
vdev_guid which was read from the label by blkid. If a match is found the
ZED reopens the pool vdev. This re-opening is important because it allows
the vdev to be briefly closed so the disk partition table can be re-read.
Otherwise, it wouldn't be possible to report the maximum possible expansion
size.

Finally, if the property autoexpand=on, a vdev expansion will be attempted.
After performing some sanity checks on the disk to verify that it is safe to
expand, the primary partition (-part1) will be expanded and the partition
table updated. The partition is then re-opened (again) to detect the updated
size, which allows the new capacity to be used.

In order to make all of the above possible the following changes were
required:

* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests. These
  tests now create a pool which is layered on a loopback, scsi_debug, and
  file vdev. This allows for testing of a non-partitioned block device
  (loopback), a partitioned block device (scsi_debug), and a file which does
  not receive udev change events. This provides better test coverage, and by
  removing the layering on ZFS volumes the issues surrounding layering one
  pool on another are avoided.

* zpool_find_vdev_by_physpath() was updated to accept a vdev guid. This
  allows for matching by guid rather than path, which is a more reliable way
  for the ZED to reference a vdev.

* Fixed zfs_zevent_wait() signal handling, which could result in the ZED
  spinning when a signal was not handled.

* Removed the vdev_disk_rrpart() functionality, which can be abandoned in
  favor of the kernel-provided blkdev_reread_part() function.

* Added a rwlock which is held as a writer while a disk is being reopened.
  This is important to prevent errors from occurring for any configuration
  related IOs which bypass the SCL_ZIO lock. The zpool_reopen_007_pos.ksh
  test case was added to verify IO errors are never observed when reopening.
  This is not expected to impact IO performance.

Additional fixes which aren't critical were discovered and resolved in the
course of developing this functionality:

* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for ZFS
  volumes. This is as good as a unique physical path; while the volumes are
  no longer used in the test cases for other reasons, this improvement was
  included.

Signed-off-by: Sara Hartse <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Issue #120
Issue #2437
Issue #5771
Issue #7366
Issue #7582
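
The zfs_member identification mentioned above is performed by blkid through the ZED's udev integration. As a rough illustration only, and not the ZED's actual code path, a device can be probed for a ZFS label with libblkid roughly as follows; the device path /dev/sdb1 and the file name probe.c are assumed examples:

/*
 * Illustrative sketch (build with: cc probe.c -lblkid): probe a block device
 * with libblkid and report whether it contains a zfs_member superblock,
 * along with the TYPE and UUID values blkid exposes for it.
 */
#include <stdio.h>
#include <string.h>
#include <blkid/blkid.h>

int
main(int argc, char **argv)
{
    const char *dev = (argc > 1) ? argv[1] : "/dev/sdb1"; /* example path */
    const char *type = NULL, *uuid = NULL;
    blkid_probe pr;

    if ((pr = blkid_new_probe_from_filename(dev)) == NULL)
        return (1);

    blkid_probe_enable_superblocks(pr, 1);
    blkid_probe_set_superblocks_flags(pr,
        BLKID_SUBLKS_TYPE | BLKID_SUBLKS_UUID);

    if (blkid_do_safeprobe(pr) == 0) {
        (void) blkid_probe_lookup_value(pr, "TYPE", &type, NULL);
        (void) blkid_probe_lookup_value(pr, "UUID", &uuid, NULL);
        if (type != NULL && strcmp(type, "zfs_member") == 0)
            printf("%s: zfs_member (UUID %s)\n", dev,
                uuid != NULL ? uuid : "unknown");
    }

    blkid_free_probe(pr);
    return (0);
}
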
1 parent 2e5dc44 commit 21f7c97

26 files changed: +677 -358 lines changed

cmd/zed/agents/zfs_mod.c

+55-30
@@ -697,51 +697,67 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
 {
     char *devname = data;
     boolean_t avail_spare, l2cache;
-    vdev_state_t newstate;
     nvlist_t *tgt;
+    int error;
 
     zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'",
         devname, zpool_get_name(zhp));
 
     if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
         &avail_spare, &l2cache, NULL)) != NULL) {
         char *path, fullpath[MAXPATHLEN];
-        uint64_t wholedisk = 0ULL;
+        uint64_t wholedisk;
 
-        verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
-            &path) == 0);
-        verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
-            &wholedisk) == 0);
+        error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path);
+        if (error) {
+            zpool_close(zhp);
+            return (0);
+        }
 
-        (void) strlcpy(fullpath, path, sizeof (fullpath));
-        if (wholedisk) {
-            char *spath = zfs_strip_partition(fullpath);
-            boolean_t scrub_restart = B_TRUE;
+        error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
+            &wholedisk);
+        if (error)
+            wholedisk = 0;
 
-            if (!spath) {
-                zed_log_msg(LOG_INFO, "%s: Can't alloc",
-                    __func__);
+        if (wholedisk) {
+            path = strrchr(path, '/');
+            if (path != NULL) {
+                path = zfs_strip_partition(path + 1);
+                if (path == NULL) {
+                    zpool_close(zhp);
+                    return (0);
+                }
+            } else {
+                zpool_close(zhp);
                 return (0);
             }
 
-            (void) strlcpy(fullpath, spath, sizeof (fullpath));
-            free(spath);
+            (void) strlcpy(fullpath, path, sizeof (fullpath));
+            free(path);
 
             /*
              * We need to reopen the pool associated with this
-             * device so that the kernel can update the size
-             * of the expanded device.
+             * device so that the kernel can update the size of
+             * the expanded device.  When expanding there is no
+             * need to restart the scrub from the beginning.
              */
+            boolean_t scrub_restart = B_FALSE;
             (void) zpool_reopen_one(zhp, &scrub_restart);
+        } else {
+            (void) strlcpy(fullpath, path, sizeof (fullpath));
         }
 
         if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
-            zed_log_msg(LOG_INFO, "zfsdle_vdev_online: setting "
-                "device '%s' to ONLINE state in pool '%s'",
-                fullpath, zpool_get_name(zhp));
-            if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL)
-                (void) zpool_vdev_online(zhp, fullpath, 0,
+            vdev_state_t newstate;
+
+            if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
+                error = zpool_vdev_online(zhp, fullpath, 0,
                     &newstate);
+                zed_log_msg(LOG_INFO, "zfsdle_vdev_online: "
+                    "setting device '%s' to ONLINE state "
+                    "in pool '%s': %d", fullpath,
+                    zpool_get_name(zhp), error);
+            }
         }
         zpool_close(zhp);
         return (1);
@@ -751,23 +767,32 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * This function handles the ESC_DEV_DLE event.
+ * This function handles the ESC_DEV_DLE device change event.  Use the
+ * provided vdev guid when looking up a disk or partition, when the guid
+ * is not present assume the entire disk is owned by ZFS and append the
+ * expected -part1 partition information then lookup by physical path.
  */
 static int
 zfs_deliver_dle(nvlist_t *nvl)
 {
-    char *devname;
-
-    if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) {
-        zed_log_msg(LOG_INFO, "zfs_deliver_dle: no physpath");
-        return (-1);
+    char *devname, name[MAXPATHLEN];
+    uint64_t guid;
+
+    if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
+        sprintf(name, "%llu", (u_longlong_t)guid);
+    } else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) {
+        strlcpy(name, devname, MAXPATHLEN);
+        zfs_append_partition(name, MAXPATHLEN);
+    } else {
+        zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath");
     }
 
-    if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) {
+    if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) {
         zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
-            "found", devname);
+            "found", name);
         return (1);
     }
+
     return (0);
 }
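
The comment added to zfs_deliver_dle() above refers to appending the expected -part1 suffix. A brief usage sketch of zfs_append_partition(); the device path is a made-up example, and the exact suffix depends on the naming scheme (by-id style paths get "-part1", while names such as /dev/sda get a bare partition number):

    char name[MAXPATHLEN];

    /* hypothetical by-id style path used only for illustration */
    (void) strlcpy(name, "/dev/disk/by-id/ata-EXAMPLE_DISK", sizeof (name));
    (void) zfs_append_partition(name, sizeof (name));
    /* name is now "/dev/disk/by-id/ata-EXAMPLE_DISK-part1" */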

config/kernel-blkdev-get.m4

-19
This file was deleted.

config/kernel-blkdev-reread-part.m4

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
+dnl #
+dnl # 4.1 API, exported blkdev_reread_part() symbol, backported to the
+dnl # 3.10.0 CentOS 7.x enterprise kernels.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_REREAD_PART], [
+    AC_MSG_CHECKING([whether blkdev_reread_part() is available])
+    ZFS_LINUX_TRY_COMPILE([
+        #include <linux/fs.h>
+    ], [
+        struct block_device *bdev = NULL;
+        int error;
+
+        error = blkdev_reread_part(bdev);
+    ], [
+        AC_MSG_RESULT(yes)
+        AC_DEFINE(HAVE_BLKDEV_REREAD_PART, 1,
+            [blkdev_reread_part() is available])
+    ], [
+        AC_MSG_RESULT(no)
+    ])
+])

config/kernel-get-gendisk.m4

-17
This file was deleted.

config/kernel.m4

+1-2
@@ -44,8 +44,8 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
     ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
     ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
     ZFS_AC_KERNEL_TYPE_FMODE_T
-    ZFS_AC_KERNEL_3ARG_BLKDEV_GET
     ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH
+    ZFS_AC_KERNEL_BLKDEV_REREAD_PART
     ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE
     ZFS_AC_KERNEL_LOOKUP_BDEV
     ZFS_AC_KERNEL_INVALIDATE_BDEV_ARGS
@@ -73,7 +73,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
     ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BLK_PLUG
     ZFS_AC_KERNEL_GET_DISK_AND_MODULE
     ZFS_AC_KERNEL_GET_DISK_RO
-    ZFS_AC_KERNEL_GET_GENDISK
     ZFS_AC_KERNEL_HAVE_BIO_SET_OP_ATTRS
     ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL
     ZFS_AC_KERNEL_DISCARD_GRANULARITY

include/linux/blkdev_compat.h

+14
@@ -364,6 +364,20 @@ bio_set_bi_error(struct bio *bio, int error)
 #define vdev_bdev_close(bdev, md)      close_bdev_excl(bdev)
 #endif /* HAVE_BLKDEV_GET_BY_PATH | HAVE_OPEN_BDEV_EXCLUSIVE */
 
+/*
+ * 4.1 - x.y.z API,
+ * 3.10.0 CentOS 7.x API,
+ *   blkdev_reread_part()
+ *
+ * For older kernels trigger a re-reading of the partition table by calling
+ * check_disk_change() which calls flush_disk() to invalidate the device.
+ */
+#ifdef HAVE_BLKDEV_REREAD_PART
+#define vdev_bdev_reread_part(bdev)    blkdev_reread_part(bdev)
+#else
+#define vdev_bdev_reread_part(bdev)    check_disk_change(bdev)
+#endif /* HAVE_BLKDEV_REREAD_PART */
+
 /*
  * 2.6.22 API change
  * The function invalidate_bdev() lost it's second argument because

include/sys/vdev_disk.h

+1
@@ -47,6 +47,7 @@ typedef struct vdev_disk {
     ddi_devid_t             vd_devid;
     char                    *vd_minor;
     struct block_device     *vd_bdev;
+    krwlock_t               vd_lock;
 } vdev_disk_t;
 
 #endif /* _KERNEL */
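
The new vd_lock is taken as a writer around the disk reopen described in the commit message, pairing with the vdev_bdev_reread_part() compatibility wrapper added in blkdev_compat.h above. A minimal sketch of the intended pattern; the actual vdev_disk.c changes are not part of the hunks shown here, and the function below is hypothetical:

/*
 * Hypothetical sketch: hold vd_lock as writer so configuration I/O that
 * bypasses the SCL_ZIO lock cannot race with the reopen, then ask the
 * kernel to re-read the partition table of the backing block device.
 */
static void
vdev_disk_reread_example(vdev_disk_t *vd)
{
    rw_enter(&vd->vd_lock, RW_WRITER);
    if (vd->vd_bdev != NULL)
        (void) vdev_bdev_reread_part(vd->vd_bdev);
    rw_exit(&vd->vd_lock);
}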

lib/libzfs/libzfs_import.c

+59-13
@@ -145,6 +145,21 @@ zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
         return (0);
     }
 
+    /*
+     * For volumes use the persistent /dev/zvol/dataset identifier
+     */
+    entry = udev_device_get_devlinks_list_entry(dev);
+    while (entry != NULL) {
+        const char *name;
+
+        name = udev_list_entry_get_name(entry);
+        if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
+            (void) strlcpy(bufptr, name, buflen);
+            return (0);
+        }
+        entry = udev_list_entry_get_next(entry);
+    }
+
     /*
      * NVME 'by-id' symlinks are similar to bus case
      */
@@ -187,26 +202,57 @@ int
 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
 {
     const char *physpath = NULL;
+    struct udev_list_entry *entry;
 
     /*
-     * Normal disks use ID_PATH for their physical path. Device mapper
-     * devices are virtual and don't have a physical path. For them we
-     * use ID_VDEV instead, which is setup via the /etc/vdev_id.conf file.
-     * ID_VDEV provides a persistent path to a virtual device. If you
-     * don't have vdev_id.conf setup, you cannot use multipath autoreplace.
+     * Normal disks use ID_PATH for their physical path.
      */
-    if (!((physpath = udev_device_get_property_value(dev, "ID_PATH")) &&
-        physpath[0])) {
-        if (!((physpath =
-            udev_device_get_property_value(dev, "ID_VDEV")) &&
-            physpath[0])) {
-            return (ENODATA);
+    physpath = udev_device_get_property_value(dev, "ID_PATH");
+    if (physpath != NULL && strlen(physpath) > 0) {
+        (void) strlcpy(bufptr, physpath, buflen);
+        return (0);
+    }
+
+    /*
+     * Device mapper devices are virtual and don't have a physical
+     * path. For them we use ID_VDEV instead, which is setup via the
+     * /etc/vdev_id.conf file.  ID_VDEV provides a persistent path
+     * to a virtual device.  If you don't have vdev_id.conf setup,
+     * you cannot use multipath autoreplace with device mapper.
+     */
+    physpath = udev_device_get_property_value(dev, "ID_VDEV");
+    if (physpath != NULL && strlen(physpath) > 0) {
+        (void) strlcpy(bufptr, physpath, buflen);
+        return (0);
+    }
+
+    /*
+     * For ZFS volumes use the persistent /dev/zvol/dataset identifier
+     */
+    entry = udev_device_get_devlinks_list_entry(dev);
+    while (entry != NULL) {
+        physpath = udev_list_entry_get_name(entry);
+        if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
+            (void) strlcpy(bufptr, physpath, buflen);
+            return (0);
         }
+        entry = udev_list_entry_get_next(entry);
     }
 
-    (void) strlcpy(bufptr, physpath, buflen);
+    /*
+     * For all other devices fallback to using the by-uuid name.
+     */
+    entry = udev_device_get_devlinks_list_entry(dev);
+    while (entry != NULL) {
+        physpath = udev_list_entry_get_name(entry);
+        if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
+            (void) strlcpy(bufptr, physpath, buflen);
+            return (0);
+        }
+        entry = udev_list_entry_get_next(entry);
+    }
 
-    return (0);
+    return (ENODATA);
 }
 
 boolean_t
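
The fallback order above (ID_PATH, then ID_VDEV, then a /dev/zvol devlink, then a by-uuid devlink) can be exercised from a small program. A rough usage sketch with libudev, assuming zfs_device_get_physical() is exposed through libzfs.h and using "sda" purely as an example device name:

/*
 * Rough usage sketch: resolve the physical path reported for a block device.
 * Assumes zfs_device_get_physical() is available via libzfs.h.
 */
#include <stdio.h>
#include <sys/param.h>
#include <libudev.h>
#include <libzfs.h>

int
main(void)
{
    struct udev *udev = udev_new();
    struct udev_device *dev;
    char physpath[MAXPATHLEN];

    /* "sda" is only an example device name */
    dev = udev_device_new_from_subsystem_sysname(udev, "block", "sda");
    if (dev != NULL) {
        if (zfs_device_get_physical(dev, physpath,
            sizeof (physpath)) == 0)
            (void) printf("physical path: %s\n", physpath);
        udev_device_unref(dev);
    }
    udev_unref(udev);
    return (0);
}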

lib/libzfs/libzfs_pool.c

+11-3
@@ -2283,17 +2283,25 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
 }
 
 /*
- * Given a physical path (minus the "/devices" prefix), find the
- * associated vdev.
+ * Given a physical path or guid, find the associated vdev.
  */
 nvlist_t *
 zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
     boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
 {
     nvlist_t *search, *nvroot, *ret;
+    uint64_t guid;
+    char *end;
 
     verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-    verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0);
+
+    guid = strtoull(ppath, &end, 0);
+    if (guid != 0 && *end == '\0') {
+        verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
+    } else {
+        verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH,
+            ppath) == 0);
+    }
 
     verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
         &nvroot) == 0);
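
Because zpool_find_vdev_by_physpath() now accepts either form, the ZED's change-event handler can pass it a vdev guid rendered as a decimal string. A minimal calling sketch; zhp and guid are placeholders for a real pool handle and guid value:

    char name[64];
    boolean_t avail_spare, l2cache;
    nvlist_t *tgt;

    (void) snprintf(name, sizeof (name), "%llu", (u_longlong_t)guid);
    tgt = zpool_find_vdev_by_physpath(zhp, name,
        &avail_spare, &l2cache, NULL);
    if (tgt != NULL) {
        /* matched a pool vdev by its guid */
    }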
