Avoid fault diagnosis if multiple vdevs have errors
When multiple drives are throwing errors, it is likely not
the drives failing but rather a failure above them, such as
a controller.  The active cases of a drive's peers are now
considered when making a diagnosis.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed by: Brian Behlendorf <[email protected]>
Signed-off-by: Don Brady <[email protected]>
Closes #16531
don-brady authored Sep 18, 2024
1 parent f245541 commit ddf5f34
Showing 5 changed files with 244 additions and 32 deletions.
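
Before the per-file diffs, the change in a nutshell. The sketch below is a minimal, self-contained simulation, not zed code: sim_case_t and count_peer_cases are hypothetical stand-ins for the zfs_case_data_t bookkeeping and the zfs_other_serd_cases() walk shown in the first file.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for the per-case data zed keeps. */
typedef struct {
	uint64_t pool_guid;
	uint64_t parent_guid;	/* new in this commit: the leaf's parent vdev */
	uint64_t vdev_guid;
	int	 serd_fired;	/* this case's SERD engine has tripped */
} sim_case_t;

/*
 * Count other cases in the same pool, under the same parent vdev,
 * whose SERD engine is also active -- the "peers" this commit consults.
 */
static int
count_peer_cases(const sim_case_t *me, const sim_case_t *all, int n)
{
	int peers = 0;

	for (int i = 0; i < n; i++) {
		if (all[i].pool_guid == me->pool_guid &&
		    all[i].parent_guid == me->parent_guid &&
		    all[i].vdev_guid != me->vdev_guid &&
		    all[i].serd_fired)
			peers++;
	}
	return (peers);
}

int
main(void)
{
	/* Four leaf vdevs under one raidz parent, all throwing errors. */
	sim_case_t cases[4];

	for (int i = 0; i < 4; i++)
		cases[i] = (sim_case_t){ 1, 100, 1000 + i, 1 };

	for (int i = 0; i < 4; i++) {
		int peers = count_peer_cases(&cases[i], cases, 4);

		/* With peers > 0, zed now retires the case instead. */
		printf("vdev %llu: %d peers -> %s\n",
		    (unsigned long long)cases[i].vdev_guid, peers,
		    peers > 0 ? "retire (suspect controller)" : "fault");
	}
	return (0);
}
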
101 changes: 72 additions & 29 deletions cmd/zed/agents/zfs_diagnosis.c
@@ -71,6 +71,7 @@ typedef struct zfs_case_data {
 	uint64_t	zc_ena;
 	uint64_t	zc_pool_guid;
 	uint64_t	zc_vdev_guid;
+	uint64_t	zc_parent_guid;
 	int		zc_pool_state;
 	char		zc_serd_checksum[MAX_SERDLEN];
 	char		zc_serd_io[MAX_SERDLEN];
@@ -181,10 +182,10 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
 }
 
 /*
- * count other unique slow-io cases in a pool
+ * Return count of other unique SERD cases under same vdev parent
  */
 static uint_t
-zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
+zfs_other_serd_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
 {
 	zfs_case_t *zcp;
 	uint_t cases = 0;
@@ -206,10 +207,32 @@ zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
 
 	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
 	    zcp = uu_list_next(zfs_cases, zcp)) {
-		if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid &&
-		    zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid &&
-		    zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
-		    fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) {
+		zfs_case_data_t *zcd = &zcp->zc_data;
+
+		/*
+		 * must be same pool and parent vdev but different leaf vdev
+		 */
+		if (zcd->zc_pool_guid != zfs_case->zc_pool_guid ||
+		    zcd->zc_parent_guid != zfs_case->zc_parent_guid ||
+		    zcd->zc_vdev_guid == zfs_case->zc_vdev_guid) {
+			continue;
+		}
+
+		/*
+		 * Check if there is another active serd case besides zfs_case
+		 *
+		 * Only one serd engine will be assigned to the case
+		 */
+		if (zcd->zc_serd_checksum[0] == zfs_case->zc_serd_checksum[0] &&
+		    fmd_serd_active(hdl, zcd->zc_serd_checksum)) {
 			cases++;
 		}
+		if (zcd->zc_serd_io[0] == zfs_case->zc_serd_io[0] &&
+		    fmd_serd_active(hdl, zcd->zc_serd_io)) {
+			cases++;
+		}
+		if (zcd->zc_serd_slow_io[0] == zfs_case->zc_serd_slow_io[0] &&
+		    fmd_serd_active(hdl, zcd->zc_serd_slow_io)) {
+			cases++;
+		}
 	}
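
A note on the first-character comparisons above: a case's SERD name buffer is empty when no engine of that type was ever created, so comparing first characters cheaply filters peers to those in the same state (when both buffers are empty, fmd_serd_active() on the empty name presumably just finds no engine). The names themselves are derived from the pool and vdev GUIDs plus the error type; a hedged sketch of the assumed scheme, since the naming helper is not part of this diff:

#include <stdio.h>
#include <inttypes.h>

#define	MAX_SERDLEN	32	/* assumed value; defined elsewhere in zed */

/* Hypothetical reconstruction: one engine name per (pool, vdev, type). */
static void
serd_name(char *buf, uint64_t pool, uint64_t vdev, const char *type)
{
	(void) snprintf(buf, MAX_SERDLEN, "zfs_%" PRIx64 "_%" PRIx64 "_%s",
	    pool, vdev, type);
}

int
main(void)
{
	char mine[MAX_SERDLEN], peer[MAX_SERDLEN] = "";

	serd_name(mine, 0x1, 0x10, "io");
	/* peer never created an io engine, so its buffer stayed empty */
	printf("same engine type? %s\n",
	    peer[0] == mine[0] ? "yes" : "no");	/* prints "no" */
	return (0);
}
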
@@ -502,6 +525,34 @@ zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
 	}
 }
 
+/*
+ * Record the specified event in the SERD engine and return a
+ * boolean value indicating whether or not the engine fired as
+ * the result of inserting this event.
+ *
+ * When the pool has similar active cases on other vdevs, then
+ * the fired state is disregarded and the case is retired.
+ */
+static int
+zfs_fm_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep,
+    zfs_case_t *zcp, const char *err_type)
+{
+	int fired = fmd_serd_record(hdl, name, ep);
+	int peers = 0;
+
+	if (fired && (peers = zfs_other_serd_cases(hdl, &zcp->zc_data)) > 0) {
+		fmd_hdl_debug(hdl, "pool %llu is tracking %d other %s cases "
+		    "-- skip faulting the vdev %llu",
+		    (u_longlong_t)zcp->zc_data.zc_pool_guid,
+		    peers, err_type,
+		    (u_longlong_t)zcp->zc_data.zc_vdev_guid);
+		zfs_case_retire(hdl, zcp);
+		fired = 0;
+	}
+
+	return (fired);
+}
+
 /*
  * Main fmd entry point.
  */
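
For readers who have not met FMD's SERD (Soft Error Rate Discrimination) engines: an engine "fires" once N events land within a sliding window of T nanoseconds, which is what the fired return value above reflects. A toy model of that idea, assuming nothing about the real libfmd internals:

#include <stdbool.h>
#include <stdint.h>

/* Toy N-events-in-T-window counter (illustrative only, not libfmd). */
typedef struct {
	uint64_t n;		/* threshold, e.g. io_n */
	uint64_t t_ns;		/* window, e.g. SEC2NSEC(io_t) */
	uint64_t stamps[64];	/* timestamps of recent events */
	int	 count;
} toy_serd_t;

static bool
toy_serd_record(toy_serd_t *s, uint64_t now_ns)
{
	int kept = 0;

	/* age out events that fell off the back of the window */
	for (int i = 0; i < s->count; i++) {
		if (now_ns - s->stamps[i] <= s->t_ns)
			s->stamps[kept++] = s->stamps[i];
	}
	s->count = kept;

	if (s->count < 64)
		s->stamps[s->count++] = now_ns;

	return ((uint64_t)s->count >= s->n);	/* the engine "fired" */
}

int
main(void)
{
	toy_serd_t s = { .n = 4, .t_ns = 60ULL * 1000000000ULL };
	uint64_t now = 0;
	bool fired = false;

	/* four io errors, one second apart: fires on the fourth */
	for (int i = 0; i < 4; i++)
		fired = toy_serd_record(&s, now += 1000000000ULL);
	return (fired ? 0 : 1);
}
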
@@ -510,7 +561,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 {
 	zfs_case_t *zcp, *dcp;
 	int32_t pool_state;
-	uint64_t ena, pool_guid, vdev_guid;
+	uint64_t ena, pool_guid, vdev_guid, parent_guid;
 	uint64_t checksum_n, checksum_t;
 	uint64_t io_n, io_t;
 	er_timeval_t pool_load;
@@ -600,6 +651,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 	if (nvlist_lookup_uint64(nvl,
 	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
 		vdev_guid = 0;
+	if (nvlist_lookup_uint64(nvl,
+	    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, &parent_guid) != 0)
+		parent_guid = 0;
 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
 		ena = 0;

@@ -710,6 +764,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 		data.zc_ena = ena;
 		data.zc_pool_guid = pool_guid;
 		data.zc_vdev_guid = vdev_guid;
+		data.zc_parent_guid = parent_guid;
 		data.zc_pool_state = (int)pool_state;
 
 		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));
@@ -872,8 +927,10 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 			    SEC2NSEC(io_t));
 			zfs_case_serialize(zcp);
 		}
-		if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
+		if (zfs_fm_serd_record(hdl, zcp->zc_data.zc_serd_io,
+		    ep, zcp, "io error")) {
 			checkremove = B_TRUE;
+		}
 	} else if (fmd_nvl_class_match(hdl, nvl,
 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) {
 		uint64_t slow_io_n, slow_io_t;
@@ -899,25 +956,10 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 		}
 		/* Pass event to SERD engine and see if this triggers */
 		if (zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
-		    fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io,
-		    ep)) {
-			/*
-			 * Ignore a slow io diagnosis when other
-			 * VDEVs in the pool show signs of being slow.
-			 */
-			if (zfs_other_slow_cases(hdl, &zcp->zc_data)) {
-				zfs_case_retire(hdl, zcp);
-				fmd_hdl_debug(hdl, "pool %llu has "
-				    "multiple slow io cases -- skip "
-				    "degrading vdev %llu",
-				    (u_longlong_t)
-				    zcp->zc_data.zc_pool_guid,
-				    (u_longlong_t)
-				    zcp->zc_data.zc_vdev_guid);
-			} else {
-				zfs_case_solve(hdl, zcp,
-				    "fault.fs.zfs.vdev.slow_io");
-			}
+		    zfs_fm_serd_record(hdl,
+		    zcp->zc_data.zc_serd_slow_io, ep, zcp, "slow io")) {
+			zfs_case_solve(hdl, zcp,
+			    "fault.fs.zfs.vdev.slow_io");
 		}
 	} else if (fmd_nvl_class_match(hdl, nvl,
 	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
@@ -968,8 +1010,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 			    SEC2NSEC(checksum_t));
 			zfs_case_serialize(zcp);
 		}
-		if (fmd_serd_record(hdl,
-		    zcp->zc_data.zc_serd_checksum, ep)) {
+		if (zfs_fm_serd_record(hdl,
+		    zcp->zc_data.zc_serd_checksum, ep, zcp,
+		    "checksum")) {
 			zfs_case_solve(hdl, zcp,
 			    "fault.fs.zfs.vdev.checksum");
 		}
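
Tying the three call sites together: in the four-disk raidz scenario the new test below creates, all four leaf vdevs trip their io SERD engines at about the same time, so each case counts three active peers and is retired instead of solved, and no vdev is faulted. Based on the fmd_hdl_debug() format string above, the zed log would then contain lines like the following (GUIDs are illustrative values):

pool 13841994288452777638 is tracking 3 other io error cases -- skip faulting the vdev 10345468992510055500
pool 13841994288452777638 is tracking 3 other io error cases -- skip faulting the vdev 128745630194726158
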
2 changes: 1 addition & 1 deletion tests/runfiles/linux.run
@@ -109,7 +109,7 @@ tags = ['functional', 'direct']
 [tests/functional/events:Linux]
 tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill',
     'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config',
-    'zed_slow_io', 'zed_slow_io_many_vdevs']
+    'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple']
 tags = ['functional', 'events']

[tests/functional/fadvise:Linux]
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
@@ -1490,6 +1490,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/events/setup.ksh \
 	functional/events/zed_cksum_config.ksh \
 	functional/events/zed_cksum_reported.ksh \
+	functional/events/zed_diagnose_multiple.ksh \
 	functional/events/zed_fd_spill.ksh \
 	functional/events/zed_io_config.ksh \
 	functional/events/zed_rc_filter.ksh \
168 changes: 168 additions & 0 deletions tests/zfs-tests/tests/functional/events/zed_diagnose_multiple.ksh
@@ -0,0 +1,168 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2024, Klara Inc.
#

# DESCRIPTION:
# Verify that simultaneous io error events from multiple vdevs
# don't generate a fault
#
# STRATEGY:
# 1. Create a pool with a 4 disk raidz vdev
# 2. Inject io errors
# 3. Verify that ZED detects the errors but doesn't fault any vdevs
#

. $STF_SUITE/include/libtest.shlib

TESTDIR="$TEST_BASE_DIR/zed_error_multiple"
VDEV1="$TEST_BASE_DIR/vdevfile1.$$"
VDEV2="$TEST_BASE_DIR/vdevfile2.$$"
VDEV3="$TEST_BASE_DIR/vdevfile3.$$"
VDEV4="$TEST_BASE_DIR/vdevfile4.$$"
VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4"
TESTPOOL="zed_test_pool"
FILEPATH="$TESTDIR/zed.testfile"

verify_runnable "both"

function cleanup
{
	log_must zinject -c all

	# if pool still exists then something failed so log additional info
	if poolexists $TESTPOOL ; then
		log_note "$(zpool status -s $TESTPOOL)"
		echo "=================== zed log search ==================="
		grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
		destroy_pool $TESTPOOL
	fi
	log_must zed_stop

	log_must rm -f $VDEVS
}

function start_io_errors
{
	for vdev in $VDEVS
	do
		log_must zpool set io_n=4 $TESTPOOL $vdev
		log_must zpool set io_t=60 $TESTPOOL $vdev
	done
	zpool sync

	for vdev in $VDEVS
	do
		log_must zinject -d $vdev -e io $TESTPOOL
	done
	zpool sync
}

function multiple_slow_vdevs_test
{
	log_must truncate -s 1G $VDEVS
	default_raidz_setup_noexit $VDEVS

	log_must zpool events -c
	log_must zfs set compression=off $TESTPOOL
	log_must zfs set primarycache=none $TESTPOOL
	log_must zfs set recordsize=4K $TESTPOOL

	log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=4
	zpool sync

	#
	# Read the file with io errors injected on the disks
	# This will cause multiple errors on each disk to trip ZED SERD
	#
	#   pool: zed_test_pool
	#  state: ONLINE
	# status: One or more devices has experienced an unrecoverable error.  An
	#         attempt was made to correct the error.  Applications are unaffected.
	# action: Determine if the device needs to be replaced, and clear the errors
	#         using 'zpool clear' or replace the device with 'zpool replace'.
	#    see: https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-9P
	# config:
	#
	#         NAME                            STATE     READ  WRITE  CKSUM
	#         zed_test_pool                   ONLINE       0      0      0
	#           raidz1-0                      ONLINE       0      0      0
	#             /var/tmp/vdevfile1.1547063  ONLINE     532    561      0
	#             /var/tmp/vdevfile2.1547063  ONLINE     547    594      0
	#             /var/tmp/vdevfile3.1547063  ONLINE   1.05K  1.10K      0
	#             /var/tmp/vdevfile4.1547063  ONLINE   1.05K  1.00K      0
	#

	start_io_errors
	dd if=$FILEPATH of=/dev/null bs=1M count=4 2>/dev/null
	log_must zinject -c all

	# count io error events available for processing
	typeset -i i=0
	typeset -i events=0
	while [[ $i -lt 60 ]]; do
		events=$(zpool events | grep "ereport\.fs\.zfs.io" | wc -l)
		[[ $events -ge "50" ]] && break
		i=$((i+1))
		sleep 1
	done
	log_note "$events io error events found"
	if [[ $events -lt "50" ]]; then
		log_note "bailing: not enough events to complete the test"
		destroy_pool $TESTPOOL
		return
	fi

	#
	# give slow ZED a chance to process the io error events
	#
	typeset -i i=0
	typeset -i skips=0
	while [[ $i -lt 75 ]]; do
		skips=$(grep "retiring case" \
		    $ZEDLET_DIR/zed.log | wc -l)
		[[ $skips -gt "0" ]] && break
		i=$((i+1))
		sleep 1
	done

	log_note $skips fault skips in ZED log after $i seconds
	[ $skips -gt "0" ] || log_fail "expecting to see skips"

	fault=$(grep "zpool_vdev_fault" $ZEDLET_DIR/zed.log | wc -l)
	log_note $fault vdev fault in ZED log
	[ $fault -eq "0" ] || \
	    log_fail "expecting no fault events, found $fault"

	destroy_pool $TESTPOOL
}

log_assert "Test ZED io errors across multiple vdevs"
log_onexit cleanup

log_must zed_events_drain
log_must zed_start
multiple_slow_vdevs_test

log_pass "Test ZED io errors across multiple vdevs"
4 changes: 2 additions & 2 deletions tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh
@@ -25,10 +25,10 @@
 #
 
 # DESCRIPTION:
-# Verify that delay events from multiple vdevs doesnt degrade
+# Verify that delay events from multiple vdevs doesn't degrade
 #
 # STRATEGY:
-# 1. Create a pool with a 3 disk raidz vdev
+# 1. Create a pool with a 4 disk raidz vdev
 # 2. Inject slow io errors
 # 3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs
 #
