From fd0406e5b332bd9f613fc7b9e8ea4ba6a6552cf1 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Mon, 14 Sep 2020 16:05:06 -0400 Subject: [PATCH 01/35] Port TrueNAS contrib changes and adjust github workflows Signed-off-by: Ameer Hamza --- .github/workflows/ci.yml | 44 ++++++ .github/workflows/docker_image.yml | 28 ++++ .github/workflows/scripts/generate-summary.sh | 2 - .github/workflows/zfs-linux.yml | 4 +- contrib/debian/changelog.in | 149 ++++++++++++++++++ contrib/debian/control | 20 ++- contrib/debian/control.modules.in | 4 +- contrib/debian/rules.in | 2 +- 8 files changed, 241 insertions(+), 12 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/docker_image.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000000..8dd405bb1193 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,44 @@ +name: CI + +on: [push] + +jobs: + build-native-deb: + runs-on: ubuntu-22.04 + container: + image: debian:testing + steps: + - name: Installing Dependencies + run: | + apt update > /dev/null 2>&1 + apt install -y linux-image-amd64 linux-headers-amd64 debhelper-compat devscripts > /dev/null 2>&1 + + - name: Checkout + uses: actions/checkout@v2 + + - name: Build deb package + run: | + mk-build-deps --build-dep contrib/debian/control + apt install -y ./*.deb + sh autogen.sh + ./configure + cp -a contrib/debian debian + sed 's/@CFGOPTS@/--enable-debuginfo/g' debian/rules.in > debian/rules + chmod +x debian/rules + dch -b -M --force-distribution --distribution bullseye-truenas-unstable 'Tagged from zfs CI' + debuild -us -uc -b + debian/rules override_dh_binary-modules + + - name: Create artifacts dir + run: mkdir artifacts + if: success() + + - name: Move artifacts + run: mv ../*.deb artifacts + if: success() + + - uses: actions/upload-artifact@v1 + with: + name: zfs-native + path: artifacts + if: success() diff --git a/.github/workflows/docker_image.yml 
b/.github/workflows/docker_image.yml new file mode 100644 index 000000000000..7f5e0c2d233c --- /dev/null +++ b/.github/workflows/docker_image.yml @@ -0,0 +1,28 @@ +name: build_image + +on: + push: + branches: + - 'truenas/zfs-2.1-release' + +jobs: + docker: + runs-on: ubuntu-latest + steps: + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Build and push + id: docker_build + uses: docker/build-push-action@v2 + with: + push: true + tags: ixsystems/zfs:latest + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/scripts/generate-summary.sh b/.github/workflows/scripts/generate-summary.sh index b5d89208a5d8..5f19aa4081fc 100755 --- a/.github/workflows/scripts/generate-summary.sh +++ b/.github/workflows/scripts/generate-summary.sh @@ -103,9 +103,7 @@ ERRLOGS=0 if [ ! 
-f Summary/Summary.md ]; then # first call, we do the default summary (~500k) echo -n > Summary.md - summarize_s "Sanity Tests Ubuntu 20.04" Logs-20.04-sanity summarize_s "Sanity Tests Ubuntu 22.04" Logs-22.04-sanity - summarize_f "Functional Tests Ubuntu 20.04" Logs-20.04-functional summarize_f "Functional Tests Ubuntu 22.04" Logs-22.04-functional cat Summary.md >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/zfs-linux.yml b/.github/workflows/zfs-linux.yml index e6b705c86055..13111f6d9847 100644 --- a/.github/workflows/zfs-linux.yml +++ b/.github/workflows/zfs-linux.yml @@ -11,7 +11,7 @@ jobs: strategy: fail-fast: false matrix: - os: [20.04, 22.04] + os: [22.04] runs-on: ubuntu-${{ matrix.os }} steps: - uses: actions/checkout@v4 @@ -32,7 +32,7 @@ jobs: strategy: fail-fast: false matrix: - os: [20.04, 22.04] + os: [22.04] needs: build uses: ./.github/workflows/zfs-linux-tests.yml with: diff --git a/contrib/debian/changelog.in b/contrib/debian/changelog.in index 9ced39b1f7b5..a36bfb1aab85 100644 --- a/contrib/debian/changelog.in +++ b/contrib/debian/changelog.in @@ -17,3 +17,152 @@ openzfs-linux (2.1.99-1) unstable; urgency=low * This packaging is a fork of Debian zfs-linux 2.1.6-2 release. 
-- Umer Saleem Fri, 11 Oct 2022 15:00:00 -0400 +openzfs-linux (2.1.12-0) unstable; urgency=medium + + * Merge tag zfs-2.1.12 + + -- Ameer Hamza Wed, 14 Jun 2023 09:00:00 -0500 + +openzfs-linux (2.1.11-0) unstable; urgency=medium + + * Merge tag zfs-2.1.11 + + -- Ameer Hamza Thu, 20 Apr 2023 13:00:00 -0500 + +openzfs-linux (2.1.10-0) unstable; urgency=medium + + * Merge tag zfs-2.1.10 + + -- Ameer Hamza Mon, 17 Apr 2023 12:00:00 -0500 + +openzfs-linux (2.1.9-0) unstable; urgency=medium + + * Merge tag zfs-2.1.9 + + -- Ryan Moeller Wed, 25 Jan 2023 14:00:00 -0500 + +openzfs-linux (2.1.8-0) unstable; urgency=medium + + * Merge tag zfs-2.1.8 + + -- Ryan Moeller Fri, 20 Jan 2023 12:30:00 -0500 + +openzfs-linux (2.1.7-1) unstable; urgency=medium + + * Integrate native Debian packaging with TrueNAS ZFS. + * This packaging is a fork of Debian zfs-linux 2.1.6-2 release. + + -- Umer Saleem Wed, 14 Dec 2022 15:00:00 -0400 + +openzfs (2.1.7-0) unstable; urgency=medium + + * Merge tag zfs-2.1.7 + * Expose libzutil error info in libpc_handle_t + * Build packages with debug symbols + * Add support for overlayfs for docker + * Optimize microzaps + * zed: Avoid core dump if wholedisk property does not exist + * zed: post a udev change event from spa_vdev_attach() + * SCALE: ignore wholedisk + * Skip trivial permission checks on xattrs for POSIX ACL type + + -- Ryan Moeller Wed, 07 Dec 2022 16:00:00 -0500 + +openzfs (2.1.6-0) unstable; urgency=medium + + * Merge tag zfs-2.1.6 + * zed: mark disks as REMOVED when they are removed + * Provide kfpu_begin/end from spl + * Add snapshots_changed as property + * Add createtxg sort support for simple snapshot iterator + * Expose ZFS dataset case sensitivity setting via sb_opts + + -- Ryan Moeller Wed, 22 Jun 2022 16:00:00 -0500 + +openzfs (2.1.5-0) unstable; urgency=medium + + * Merged tag zfs-2.1.5 + + -- Ryan Moeller Wed, 22 Jun 2022 16:00:00 -0500 + +openzfs (2.1.4-1) unstable; urgency=medium + + * Merged from zfs-2.1.5-staging + * Remove 
wrong assertion in log spacemap + * Add debug symbols to the truenas package + + -- Ryan Moeller Thu, 02 Jun 2022 08:00:00 -0500 + +openzfs (2.1.4-0) unstable; urgency=medium + + * Merged OpenZFS zfs-2.1.4 + * Also merged from zfs-2.1.5-staging + * Fix zfs send -V + * Expose zpool guids through kstats + + -- Ryan Moeller Wed, 18 May 2022 14:00:00 -0500 + +openzfs (2.1.3-0) unstable; urgency=medium + + * Merged OpenZFS zfs-2.1.3 + + -- Ryan Moeller Thu, 10 Mar 2022 16:00:00 -0400 + +openzfs (2.1.2-1) unstable; urgency=medium + + * Merged OpenZFS zfs-2.1.2-release + * Implement FS_IOC_GETVERSION + * Improve log spacemap load time after unclean export + + -- Ryan Moeller Mon, 20 Dec 2021 10:00:00 -0400 + +openzfs (2.1.2-0) unstable; urgency=medium + + * Merged OpenZFS zfs-2.1.2-staging + * Updated Python build dependencies + * Linux 5.14 build fixes + + -- Ryan Moeller Wed, 03 Nov 2021 10:00:00 -0400 + +openzfs (2.1.1-0) unstable; urgency=medium + + * Merged OpenZFS 2.1.1 + * Removed feature@xattr_compat and xattr_fallback property + + -- Ryan Moeller Thu, 07 Oct 2021 14:09:30 -0400 + +openzfs (2.1.0-0) unstable; urgency=medium + + * Rebased to OpenZFS 2.1.0 + + -- Ryan Moeller Thu, 01 Apr 2021 13:00:00 -0500 + +openzfs (2.0.4-0) unstable; urgency=medium + + * Rebased to OpenZFS 2.0.4 + + -- Ryan Moeller Tue, 09 Mar 2021 14:00:00 -0500 + +openzfs (2.0.3-0) unstable; urgency=medium + + * Rebased to OpenZFS 2.0.3 + + -- Ryan Moeller Wed, 24 Feb 2021 15:00:00 -0500 + +openzfs (2.0.2-0) unstable; urgency=medium + + * Rebased to OpenZFS 2.0.2 + + -- Ryan Moeller Mon, 01 Feb 2021 15:00:00 -0500 + +openzfs (2.0.1-0) unstable; urgency=medium + + * Rebased to OpenZFS 2.0.1 + + -- Ryan Moeller Thu, 07 Jan 2021 17:34:28 -0500 + +openzfs (2.0.0-0) unstable; urgency=medium + + * Initial package for TrueNAS SCALE based on OpenZFS 2.0.0 + + -- Ryan Moeller Mon, 14 Sep 2020 22:01:55 -0400 diff --git a/contrib/debian/control b/contrib/debian/control index 98beb900d0fa..5b7874d11401
100644 --- a/contrib/debian/control +++ b/contrib/debian/control @@ -4,10 +4,9 @@ Priority: optional Maintainer: ZFS on Linux specific mailing list Build-Depends: debhelper-compat (= 12), dh-python, - dh-sequence-dkms | dkms (>> 2.1.1.2-5), + dh-sequence-dkms, libaio-dev, libblkid-dev, - libcurl4-openssl-dev, libelf-dev, libpam0g-dev, libssl-dev | libssl1.0-dev, @@ -35,6 +34,7 @@ Depends: ${misc:Depends}, ${shlibs:Depends} Breaks: libnvpair1, libnvpair3 Replaces: libnvpair1, libnvpair3, libnvpair3linux Conflicts: libnvpair3linux +Provides: libnvpair3 Description: Solaris name-value library for Linux This library provides routines for packing and unpacking nv pairs for transporting data across process boundaries, transporting between @@ -46,6 +46,7 @@ Architecture: linux-any Depends: libpam-runtime, ${misc:Depends}, ${shlibs:Depends} Replaces: libpam-zfs Conflicts: libpam-zfs +Provides: pam-zfs-key Description: PAM module for managing encryption keys for ZFS OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, @@ -61,6 +62,7 @@ Depends: ${misc:Depends}, ${shlibs:Depends} Breaks: libuutil1, libuutil3 Replaces: libuutil1, libuutil3, libuutil3linux Conflicts: libuutil3linux +Provides: libuutil3 Description: Solaris userland utility library for Linux This library provides a variety of glue functions for ZFS on Linux: * libspl: The Solaris Porting Layer userland library, which provides APIs @@ -84,7 +86,7 @@ Depends: libssl-dev | libssl1.0-dev, ${misc:Depends} Replaces: libzfslinux-dev Conflicts: libzfslinux-dev -Provides: libnvpair-dev, libuutil-dev +Provides: libnvpair-dev, libuutil-dev, libzfs5-devel Description: OpenZFS filesystem development files for Linux Header files and static libraries for compiling software against libraries of OpenZFS filesystem. 
@@ -102,6 +104,7 @@ Recommends: libcurl4 Breaks: libzfs2, libzfs4 Replaces: libzfs2, libzfs4, libzfs4linux Conflicts: libzfs4linux +Provides: libzfs5 Description: OpenZFS filesystem library for Linux - general support OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, @@ -130,6 +133,7 @@ Depends: ${misc:Depends}, ${shlibs:Depends} Breaks: libzpool2, libzpool5 Replaces: libzpool2, libzpool5, libzpool5linux Conflicts: libzpool5linux +Provides: libzpool5 Description: OpenZFS pool library for Linux OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, @@ -146,6 +150,7 @@ Depends: python3-cffi, ${python3:Depends} Replaces: python3-pyzfs Conflicts: python3-pyzfs +Provides: python3-pyzfs Description: wrapper for libzfs_core C library libzfs_core is intended to be a stable interface for programmatic administration of ZFS. This wrapper provides one-to-one wrappers for @@ -197,6 +202,7 @@ Recommends: openzfs-zfs-zed, openzfs-zfsutils (>= ${source:Version}), ${linux:Re Suggests: debhelper Breaks: spl-dkms (<< 0.8.0~rc1) Replaces: spl-dkms, zfs-dkms +Conflicts: zfs-dkms Provides: openzfs-zfs-modules Description: OpenZFS filesystem kernel modules for Linux OpenZFS is a storage platform that encompasses the functionality of @@ -216,6 +222,7 @@ Depends: busybox-initramfs | busybox-static | busybox, Breaks: zfsutils-linux (<= 0.7.11-2) Replaces: zfsutils-linux (<= 0.7.11-2), zfs-initramfs Conflicts: zfs-initramfs +Provides: zfs-initramfs Description: OpenZFS root filesystem capabilities for Linux - initramfs OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. 
It supports data checksums, @@ -232,6 +239,7 @@ Depends: dracut, ${misc:Depends} Conflicts: zfs-dracut Replaces: zfs-dracut +Provides: zfs-dracut Description: OpenZFS root filesystem capabilities for Linux - dracut OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, @@ -248,10 +256,11 @@ Depends: openzfs-libnvpair3 (= ${binary:Version}), openzfs-libuutil3 (= ${binary:Version}), openzfs-libzfs4 (= ${binary:Version}), openzfs-libzpool5 (= ${binary:Version}), + openzfs-zfs-modules | openzfs-zfs-dkms, python3, ${misc:Depends}, ${shlibs:Depends} -Recommends: lsb-base, openzfs-zfs-modules | openzfs-zfs-dkms, openzfs-zfs-zed +Recommends: lsb-base, openzfs-zfs-zed Breaks: openrc, spl (<< 0.7.9-2), spl-dkms (<< 0.8.0~rc1), @@ -262,7 +271,7 @@ Conflicts: zfs, zfs-fuse, zfsutils-linux Suggests: nfs-kernel-server, samba-common-bin (>= 3.0.23), openzfs-zfs-initramfs | openzfs-zfs-dracut -Provides: openzfsutils +Provides: openzfs Description: command-line tools to manage OpenZFS filesystems OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, @@ -316,6 +325,7 @@ Recommends: nfs-kernel-server Breaks: zfsutils-linux (<= 0.7.9-2) Replaces: zfsutils-linux (<= 0.7.9-2), zfs-test Conflicts: zutils, zfs-test +Provides: zfs-test Description: OpenZFS test infrastructure and support scripts OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. 
It supports data checksums, diff --git a/contrib/debian/control.modules.in b/contrib/debian/control.modules.in index 34eb7fafba7c..e6e2b9b5be20 100644 --- a/contrib/debian/control.modules.in +++ b/contrib/debian/control.modules.in @@ -5,7 +5,7 @@ Maintainer: ZFS on Linux specific mailing list Build-Depends: debhelper-compat (= 10), dkms (>> 2.1.1.2-5), libtool, - linux-headers-_KVERS_ | raspberrypi-kernel-headers + linux-headers-_KVERS_ Standards-Version: 4.3.0 Homepage: http://www.openzfs.org/ Vcs-Git: https://github.com/openzfs/zfs.git @@ -14,7 +14,7 @@ Vcs-Browser: https://github.com/openzfs/zfs Package: openzfs-zfs-modules-_KVERS_ Architecture: _ARCH_ Provides: openzfs-zfs-modules -Depends: linux-image-_KVERS_ | raspberrypi-kernel +Depends: linux-image-_KVERS_ | linux-image-amd64 Recommends: openzfsutils Replaces: zfs-modules-_KVERS_ Conflicts: zfs-modules-_KVERS_ diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index a3a05efacb50..260f9279158f 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -25,7 +25,7 @@ PARALLEL = $(subst parallel=,,$(filter parallel=%,$(DEB_BUILD_OPTIONS))) NJOBS = -j$(or $(PARALLEL),$(NUM_CPUS),1) %: - dh $@ --with autoreconf,dkms,python3,sphinxdoc + dh $@ --with autoreconf,python3,sphinxdoc override_dh_autoreconf: @# Embed the downstream version in the module. From 6343ecfcfe87a4b746c999ec64aca986b907d2f6 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Mon, 22 Mar 2021 15:26:19 -0400 Subject: [PATCH 02/35] Advertise support for large xattrs on TrueNAS SB_LARGEXATTR is used in TrueNAS SCALE to indicate to the kernel that the filesystem supports large-size xattrs (greater than 64KiB). This flag is used to evaluate whether to allow large xattr read or write requests (up to 2 MiB). 
Signed-off-by: Andrew Walker --- module/os/linux/zfs/zfs_vfsops.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 2015c20d7340..8800a6c22097 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -336,9 +336,15 @@ xattr_changed_cb(void *arg, uint64_t newval) zfsvfs_t *zfsvfs = arg; if (newval == ZFS_XATTR_OFF) { +#ifdef SB_LARGEXATTR + zfsvfs->z_sb->s_flags &= ~SB_LARGEXATTR; +#endif zfsvfs->z_flags &= ~ZSB_XATTR; } else { zfsvfs->z_flags |= ZSB_XATTR; +#ifdef SB_LARGEXATTR + zfsvfs->z_sb->s_flags |= SB_LARGEXATTR; +#endif if (newval == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; From b5748dea7150f11aabe3273321fd7c087148f18c Mon Sep 17 00:00:00 2001 From: Andrew Date: Mon, 24 May 2021 10:49:43 -0400 Subject: [PATCH 03/35] Implement NFSv41 ACLs through xattr This implements NFSv41 (RFC 5661) ACLs in a manner compatible with vfs_nfs4acl_xattr in Samba and nfs4xdr-acl-tools. There are three key areas of change in this commit: 1) NFSv4 ACL management through system.nfs4_acl_xdr xattr. Install an xattr handler for "system.nfs4_acl_xdr" that presents an xattr containing full NFSv41 ACL structures generated through rpcgen using specification from the Samba project. This xattr is used by userspace programs to read and set permissions. 2) add an i_op->permission endpoint: zpl_permission(). This is used by the VFS in Linux to determine whether to allow / deny an operation. Wherever possible, we try to avoid having to call zfs_access(). If kernel has NFSv4 patch for VFS, then perform more complete check of available access mask. 3) add capability-based overrides to secpolicy_vnode_access2(). There are various situations in which ACL may need to be overridden based on capabilities. This logic is almost directly copied from Linux VFS. For instance, root needs to be able to always read / write ACLs (otherwise admin can get locked out from files).
This commit was initially inspired by work from Paul B. Henson to implement NFSv4.0 (RFC3530) ACLs in ZFS on Linux. Key areas of divergence are as follows: - ACL specification, xattr format, xattr name - Addition of handling for NFSv4 masks from Linux VFS - Addition of ACL overrides based on capabilities Signed-off-by: Andrew Walker --- .gitignore | 2 + Makefile.am | 1 + include/os/linux/spl/rpc/xdr.h | 1 + include/os/linux/zfs/sys/zpl.h | 2 + module/Kbuild.in | 1 + module/Makefile.in | 12 +- module/os/linux/zfs/nfs41acl.x | 74 +++++ module/os/linux/zfs/policy.c | 44 ++- module/os/linux/zfs/zfs_vfsops.c | 14 +- module/os/linux/zfs/zpl_inode.c | 3 + module/os/linux/zfs/zpl_super.c | 3 + module/os/linux/zfs/zpl_xattr.c | 471 +++++++++++++++++++++++++++++++ 12 files changed, 624 insertions(+), 4 deletions(-) create mode 100644 module/os/linux/zfs/nfs41acl.x diff --git a/.gitignore b/.gitignore index a2cb92dd5406..bd3d4befd291 100644 --- a/.gitignore +++ b/.gitignore @@ -84,6 +84,8 @@ modules.order Makefile Makefile.in changelog +nfs41acl.h +nfs41acl_xdr.c *.patch *.orig *.tmp diff --git a/Makefile.am b/Makefile.am index 11e45dae8255..76ec01f2aa31 100644 --- a/Makefile.am +++ b/Makefile.am @@ -122,6 +122,7 @@ cstyle: $(AM_V_at)find $(top_srcdir) -name build -prune \ -o -type f -name '*.[hc]' \ ! -name 'zfs_config.*' ! -name '*.mod.c' \ + ! -name 'nfs41acl_xdr.c' ! -name 'nfs41acl.h' \ ! -name 'opt_global.h' ! -name '*_if*.h' \ ! -name 'zstd_compat_wrapper.h' \ !
-path './module/zstd/lib/*' \ diff --git a/include/os/linux/spl/rpc/xdr.h b/include/os/linux/spl/rpc/xdr.h index b00f3542fcdf..05aed7cb81ce 100644 --- a/include/os/linux/spl/rpc/xdr.h +++ b/include/os/linux/spl/rpc/xdr.h @@ -22,6 +22,7 @@ #define _SPL_RPC_XDR_H #include +#include typedef int bool_t; diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 91a4751fffb0..afea2336ba76 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -106,6 +106,8 @@ zpl_chmod_acl(struct inode *ip) } #endif /* CONFIG_FS_POSIX_ACL */ +extern int zpl_permission(struct inode *ip, int mask); + extern xattr_handler_t *zpl_xattr_handlers[]; /* zpl_ctldir.c */ diff --git a/module/Kbuild.in b/module/Kbuild.in index 7e08374fa2b9..94d6987b0af5 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -439,6 +439,7 @@ ZFS_OBJS_OS := \ abd_os.o \ arc_os.o \ mmp_os.o \ + nfs41acl_xdr.o \ policy.o \ qat.o \ qat_compress.o \ diff --git a/module/Makefile.in b/module/Makefile.in index 9b34b3dfaec7..757dfc088ddf 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -53,6 +53,16 @@ FMAKE = env -u MAKEFLAGS make $(FMAKEFLAGS) modules-Linux: mkdir -p $(sort $(dir $(spl-objs) $(spl-))) mkdir -p $(sort $(dir $(zfs-objs) $(zfs-))) + @# Generate the nfs41acl XDR header. + rpcgen -h "@abs_srcdir@/os/linux/zfs/nfs41acl.x" > "@abs_srcdir@/os/linux/zfs/nfs41acl.h" + sed -i "/\/c\\#include \" "@abs_srcdir@/os/linux/zfs/nfs41acl.h" + sed -i "9i #include " "@abs_srcdir@/os/linux/zfs/nfs41acl.h" + @# Generate the nfs41acl XDR code. + rpcgen -c "@abs_srcdir@/os/linux/zfs/nfs41acl.x" > "@abs_srcdir@/os/linux/zfs/nfs41acl_xdr.c" + sed -i "5i #pragma GCC diagnostic push" "@abs_srcdir@/os/linux/zfs/nfs41acl_xdr.c" + sed -i "6i #pragma GCC diagnostic ignored \"-Wunused-variable\"" "@abs_srcdir@/os/linux/zfs/nfs41acl_xdr.c" + echo "#pragma GCC diagnostic pop" >> "@abs_srcdir@/os/linux/zfs/nfs41acl_xdr.c" + @# Build the kernel modules. 
$(MAKE) -C @LINUX_OBJ@ $(if @KERNEL_CC@,CC=@KERNEL_CC@) \ $(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \ M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules @@ -161,7 +171,7 @@ cppcheck-FreeBSD: cppcheck: cppcheck-@ac_system@ distdir: - cd @srcdir@ && find . -name '*.[chS]' -exec sh -c 'for f; do mkdir -p $$distdir/$${f%/*}; cp @srcdir@/$$f $$distdir/$$f; done' _ {} + + cd @srcdir@ && find . -name '*.[chxS]' -exec sh -c 'for f; do mkdir -p $$distdir/$${f%/*}; cp @srcdir@/$$f $$distdir/$$f; done' _ {} + cp @srcdir@/Makefile.bsd $$distdir/Makefile.bsd gen-zstd-symbols: diff --git a/module/os/linux/zfs/nfs41acl.x b/module/os/linux/zfs/nfs41acl.x new file mode 100644 index 000000000000..1df04e8b8af8 --- /dev/null +++ b/module/os/linux/zfs/nfs41acl.x @@ -0,0 +1,74 @@ +const ACE4_ACCESS_ALLOWED_ACE_TYPE = 0x00000000; +const ACE4_ACCESS_DENIED_ACE_TYPE = 0x00000001; +const ACE4_SYSTEM_AUDIT_ACE_TYPE = 0x00000002; +const ACE4_SYSTEM_ALARM_ACE_TYPE = 0x00000003; + +typedef u_int acetype4; + +const ACE4_FILE_INHERIT_ACE = 0x00000001; +const ACE4_DIRECTORY_INHERIT_ACE = 0x00000002; +const ACE4_NO_PROPAGATE_INHERIT_ACE = 0x00000004; +const ACE4_INHERIT_ONLY_ACE = 0x00000008; +const ACE4_SUCCESSFUL_ACCESS_ACE_FLAG = 0x00000010; +const ACE4_FAILED_ACCESS_ACE_FLAG = 0x00000020; +const ACE4_IDENTIFIER_GROUP = 0x00000040; +const ACE4_INHERITED_ACE = 0x00000080; + +typedef u_int aceflag4; + +const ACEI4_SPECIAL_WHO = 0x00000001; + +typedef u_int aceiflag4; + +const ACE4_SPECIAL_OWNER = 1; +const ACE4_SPECIAL_GROUP = 2; +const ACE4_SPECIAL_EVERYONE = 3; +const ACE4_SPECIAL_INTERACTIVE = 4; +const ACE4_SPECIAL_NETWORK = 5; +const ACE4_SPECIAL_DIALUP = 6; +const ACE4_SPECIAL_BATCH = 7; +const ACE4_SPECIAL_ANONYMOUS = 8; +const ACE4_SPECIAL_AUTHENTICATED = 9; +const ACE4_SPECIAL_SERVICE = 10; + +const ACE4_READ_DATA = 0x00000001; +const ACE4_LIST_DIRECTORY = 0x00000001; +const ACE4_WRITE_DATA = 0x00000002; +const ACE4_ADD_FILE = 0x00000002; +const ACE4_APPEND_DATA = 
0x00000004; +const ACE4_ADD_SUBDIRECTORY = 0x00000004; +const ACE4_READ_NAMED_ATTRS = 0x00000008; +const ACE4_WRITE_NAMED_ATTRS = 0x00000010; +const ACE4_EXECUTE = 0x00000020; +const ACE4_DELETE_CHILD = 0x00000040; +const ACE4_READ_ATTRIBUTES = 0x00000080; +const ACE4_WRITE_ATTRIBUTES = 0x00000100; +const ACE4_WRITE_RETENTION = 0x00000200; +const ACE4_WRITE_RETENTION_HOLD = 0x00000400; + +const ACE4_DELETE = 0x00010000; +const ACE4_READ_ACL = 0x00020000; +const ACE4_WRITE_ACL = 0x00040000; +const ACE4_WRITE_OWNER = 0x00080000; +const ACE4_SYNCHRONIZE = 0x00100000; + +typedef u_int acemask4; + +struct nfsace4i { + acetype4 type; + aceflag4 flag; + aceiflag4 iflag; + acemask4 access_mask; + u_int who; +}; + +const ACL4_AUTO_INHERIT = 0x00000001; +const ACL4_PROTECTED = 0x00000002; +const ACL4_DEFAULTED = 0x00000004; + +typedef u_int aclflag4; + +struct nfsacl41i { + aclflag4 na41_flag; + nfsace4i na41_aces<>; +}; diff --git a/module/os/linux/zfs/policy.c b/module/os/linux/zfs/policy.c index 5d1b4383412a..f8a808604671 100644 --- a/module/os/linux/zfs/policy.c +++ b/module/os/linux/zfs/policy.c @@ -33,6 +33,7 @@ #include #include #include +#include /* * The passed credentials cannot be directly verified because Linux only @@ -103,13 +104,52 @@ secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) * Like secpolicy_vnode_access() but we get the actual wanted mode and the * current mode of the file, not the missing bits. * - * Enforced in the Linux VFS. + * If filesystem is using NFSv4 ACLs, validate the current mode + * and the wanted mode are the same, otherwise access fails. + * + * If using POSIX ACLs or no ACLs, enforced in the Linux VFS. 
*/ int secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner, mode_t curmode, mode_t wantmode) { - return (0); + mode_t remainder = ~curmode & wantmode; + if ((ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_NFSV4) || + (remainder == 0)) { + return (0); + } + + /* + * short-circuit if root + */ + if (capable(CAP_SYS_ADMIN)) { + return (0); + } + + /* + * There are some situations in which capabilities + * may allow overriding the DACL. + */ + if (S_ISDIR(ip->i_mode)) { + if (!(wantmode & S_IWUSR) && + capable(CAP_DAC_READ_SEARCH)) { + return (0); + } + if (capable(CAP_DAC_OVERRIDE)) { + return (0); + } + return (EACCES); + } + + if ((wantmode == S_IRUSR) && capable(CAP_DAC_READ_SEARCH)) { + return (0); + } + + if (!(remainder & S_IXUSR) && capable(CAP_DAC_OVERRIDE)) { + return (0); + } + + return (EACCES); } /* diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 8800a6c22097..dcc586362cc7 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -359,12 +359,17 @@ acltype_changed_cb(void *arg, uint64_t newval) zfsvfs_t *zfsvfs = arg; switch (newval) { - case ZFS_ACLTYPE_NFSV4: case ZFS_ACLTYPE_OFF: zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; +#ifdef SB_NFSV4ACL + zfsvfs->z_sb->s_flags &= ~SB_NFSV4ACL; +#endif break; case ZFS_ACLTYPE_POSIX: +#ifdef SB_NFSV4ACL + zfsvfs->z_sb->s_flags &= ~SB_NFSV4ACL; +#endif #ifdef CONFIG_FS_POSIX_ACL zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIX; zfsvfs->z_sb->s_flags |= SB_POSIXACL; @@ -373,6 +378,13 @@ acltype_changed_cb(void *arg, uint64_t newval) zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; #endif /* CONFIG_FS_POSIX_ACL */ break; + case ZFS_ACLTYPE_NFSV4: + zfsvfs->z_acl_type = ZFS_ACLTYPE_NFSV4; + zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; +#ifdef SB_NFSV4ACL + zfsvfs->z_sb->s_flags |= SB_NFSV4ACL; +#endif + break; default: break; } diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index 
ad1753f7a071..b25f3e9224f3 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -816,6 +816,7 @@ const struct inode_operations zpl_inode_operations = { .get_acl = zpl_get_acl, #endif /* HAVE_GET_INODE_ACL */ #endif /* CONFIG_FS_POSIX_ACL */ + .permission = zpl_permission, }; #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER @@ -862,6 +863,7 @@ const struct inode_operations zpl_dir_inode_operations = { .get_acl = zpl_get_acl, #endif /* HAVE_GET_INODE_ACL */ #endif /* CONFIG_FS_POSIX_ACL */ + .permission = zpl_permission, #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER }, .rename2 = zpl_rename2, @@ -909,4 +911,5 @@ const struct inode_operations zpl_special_inode_operations = { .get_acl = zpl_get_acl, #endif /* HAVE_GET_INODE_ACL */ #endif /* CONFIG_FS_POSIX_ACL */ + .permission = zpl_permission, }; diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index d98d32c1f9fb..670058bd4eda 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -230,6 +230,9 @@ __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) case ZFS_ACLTYPE_POSIX: seq_puts(seq, ",posixacl"); break; + case ZFS_ACLTYPE_NFSV4: + seq_puts(seq, ",nfs4acl"); + break; default: seq_puts(seq, ",noacl"); break; diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 4e4f5210f85d..4005258728da 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -80,10 +80,34 @@ #include #include #include +#include #include #include #include #include +#include +#include "nfs41acl.h" + +#define NFS41ACL_XATTR "system.nfs4_acl_xdr" + +static const struct { + int kmask; + int zfsperm; +} mask2zfs[] = { + { MAY_READ, ACE_READ_DATA }, + { MAY_WRITE, ACE_WRITE_DATA }, + { MAY_EXEC, ACE_EXECUTE }, +#ifdef SB_NFSV4ACL + { MAY_DELETE, ACE_DELETE }, + { MAY_DELETE_CHILD, ACE_DELETE_CHILD }, + { MAY_WRITE_ATTRS, ACE_WRITE_ATTRIBUTES }, + { MAY_WRITE_NAMED_ATTRS, ACE_WRITE_NAMED_ATTRS }, + { MAY_WRITE_ACL, 
ACE_WRITE_ACL }, + { MAY_WRITE_OWNER, ACE_WRITE_OWNER }, +#endif +}; + +#define GENERIC_MASK(mask) ((mask & ~(MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) enum xattr_permission { XAPERM_DENY, @@ -1457,6 +1481,448 @@ static xattr_handler_t zpl_xattr_acl_default_handler = { #endif /* CONFIG_FS_POSIX_ACL */ +int +zpl_permission(struct inode *ip, int mask) +{ + int to_check = 0, i, ret; + cred_t *cr = NULL; + + /* + * If NFSv4 ACLs are not being used, go back to + * generic_permission(). If ACL is trivial and the + * mask is representable by POSIX permissions, then + * also go back to generic_permission(). + */ + if ((ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_NFSV4) || + ((ITOZ(ip)->z_pflags & ZFS_ACL_TRIVIAL && GENERIC_MASK(mask)))) { + return (generic_permission(ip, mask)); + } + + for (i = 0; i < ARRAY_SIZE(mask2zfs); i++) { + if (mask & mask2zfs[i].kmask) { + to_check |= mask2zfs[i].zfsperm; + } + } + + /* + * We're being asked to check something that doesn't contain an + * NFSv4 ACE. Pass back to default kernel permissions check. + */ + if (to_check == 0) { + return (generic_permission(ip, mask)); + } + + /* + * Avoid potentially blocking in RCU walk. + */ + if (mask & MAY_NOT_BLOCK) { + return (-ECHILD); + } + + cr = CRED(); + crhold(cr); + ret = -zfs_access(ITOZ(ip), to_check, V_ACE_MASK, cr); + if (ret != -EPERM && ret != -EACCES) { + crfree(cr); + return (ret); + } + + /* + * There are some situations in which capabilities + * may allow overriding the DACL. 
+ */ + if (S_ISDIR(ip->i_mode)) { +#ifdef SB_NFSV4ACL + if (!(mask & (MAY_WRITE | NFS41ACL_WRITE_ALL))) { +#else + if (!(mask & MAY_WRITE)) { +#endif + if (capable(CAP_DAC_READ_SEARCH)) { + crfree(cr); + return (0); + } + } + if (capable(CAP_DAC_OVERRIDE)) { + crfree(cr); + return (0); + } + crfree(cr); + return (ret); + } + + if (to_check == ACE_READ_DATA) { + if (capable(CAP_DAC_READ_SEARCH)) { + crfree(cr); + return (0); + } + } + + if (!(mask & MAY_EXEC) || + (zfs_fastaccesschk_execute(ITOZ(ip), cr) == 0)) { + if (capable(CAP_DAC_OVERRIDE)) { + crfree(cr); + return (0); + } + } + + crfree(cr); + return (ret); +} + +#define NFS41ACL_MAX_ACES 128 +#define NFS41_FLAGS (ACE_DIRECTORY_INHERIT_ACE| \ + ACE_FILE_INHERIT_ACE| \ + ACE_NO_PROPAGATE_INHERIT_ACE| \ + ACE_INHERIT_ONLY_ACE| \ + ACE_INHERITED_ACE| \ + ACE_IDENTIFIER_GROUP) + +/* + * Macros for sanity checks related to XDR and ACL buffer sizes + */ +#define ACE4SIZE (sizeof (nfsace4i)) +#define ACLBASE (sizeof (nfsacl41i)) +#define XDRBASE (2 * sizeof (uint_t)) + +#define ACES_TO_SIZE(x, y) (x + (y * ACE4SIZE)) +#define SIZE_TO_ACES(x, y) ((y - x) / ACE4SIZE) +#define SIZE_IS_VALID(x, y) ((x >= ACES_TO_SIZE(y, 0)) && \ + (((x - y) % ACE4SIZE) == 0)) + +#define ACES_TO_ACLSIZE(x) (ACES_TO_SIZE(ACLBASE, x)) +#define ACES_TO_XDRSIZE(x) (ACES_TO_SIZE(XDRBASE, x)) + +#define ACLSIZE_TO_ACES(x) (SIZE_TO_ACES(ACLBASE, x)) +#define XDRSIZE_TO_ACES(x) (SIZE_TO_ACES(XDRBASE, x)) + +#define ACLSIZE_IS_VALID(x) (SIZE_IS_VALID(x, ACLBASE)) +#define XDRSIZE_IS_VALID(x) (SIZE_IS_VALID(x, XDRBASE)) + +static int +__zpl_xattr_nfs41acl_list(struct inode *ip, char *list, size_t list_size, + const char *name, size_t name_len) +{ + char *xattr_name = NFS41ACL_XATTR; + size_t xattr_size = sizeof (NFS41ACL_XATTR); + + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_NFSV4) + return (0); + + if (list && xattr_size <= list_size) + memcpy(list, xattr_name, xattr_size); + + return (xattr_size); +} 
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_nfs41acl_list);
+
+/*
+ * Convert one ZFS ace_t into its nfsace4i representation.
+ *
+ * type, access mask and the NFS41_FLAGS subset of the flags are copied.
+ * owner@, group@ and everyone@ entries are marked with ACEI4_SPECIAL_WHO
+ * and the matching ACE4_SPECIAL_* id; any other entry carries a_who
+ * unchanged.  Returns 0, or -EINVAL for an unrecognized combination of
+ * ACE_TYPE_FLAGS.
+ */
+static int
+acep_to_nfsace4i(const ace_t *acep, nfsace4i *nacep)
+{
+	nacep->type = acep->a_type;
+	nacep->flag = acep->a_flags & NFS41_FLAGS;
+	nacep->access_mask = acep->a_access_mask;
+	/*
+	 * The destination comes from kmem_alloc() and is not zeroed, and
+	 * iflag is only ever OR-ed into below, so start it from a known
+	 * value.
+	 */
+	nacep->iflag = 0;
+
+	switch (acep->a_flags & ACE_TYPE_FLAGS) {
+	case ACE_OWNER:
+		nacep->iflag |= ACEI4_SPECIAL_WHO;
+		nacep->who = ACE4_SPECIAL_OWNER;
+		break;
+
+	case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+		nacep->iflag |= ACEI4_SPECIAL_WHO;
+		nacep->who = ACE4_SPECIAL_GROUP;
+		break;
+
+	case ACE_EVERYONE:
+		nacep->iflag |= ACEI4_SPECIAL_WHO;
+		nacep->who = ACE4_SPECIAL_EVERYONE;
+		break;
+
+	case ACE_IDENTIFIER_GROUP:
+	case 0:
+		nacep->who = acep->a_who;
+		break;
+
+	default:
+		dprintf("Unknown ACE_TYPE_FLAG 0x%08x\n",
+		    acep->a_flags & ACE_TYPE_FLAGS);
+		return (-EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * Build an nfsacl41i from the ACEs held in `vsecp'.
+ *
+ * A single buffer is allocated that holds the nfsacl41i header directly
+ * followed by the ACE array.  On success the buffer and its size are
+ * returned through `_nacl' and `_size', and the caller releases it with
+ * kmem_free(*_nacl, *_size).  On failure the buffer is freed here and the
+ * error from acep_to_nfsace4i() is returned.
+ */
+static int
+zfsacl_to_nfsacl41i(const vsecattr_t vsecp, nfsacl41i **_nacl, size_t *_size)
+{
+	nfsacl41i *nacl = NULL;
+	nfsace4i *nacep = NULL;
+	ace_t *acep = NULL;
+
+	int i, error;
+	size_t acl_size;
+
+	acl_size = ACES_TO_ACLSIZE(vsecp.vsa_aclcnt);
+
+	nacl = kmem_alloc(acl_size, KM_SLEEP);
+	nacl->na41_aces.na41_aces_len = vsecp.vsa_aclcnt;
+	nacl->na41_flag = vsecp.vsa_aclflags;
+	/* the ACE array lives right behind the header in the same buffer */
+	nacep = (nfsace4i *)((char *)nacl + sizeof (nfsacl41i));
+	nacl->na41_aces.na41_aces_val = nacep;
+
+	for (i = 0; i < nacl->na41_aces.na41_aces_len; i++) {
+		nacep = &nacl->na41_aces.na41_aces_val[i];
+		acep = vsecp.vsa_aclentp + (i * sizeof (ace_t));
+		error = acep_to_nfsace4i(acep, nacep);
+		if (error) {
+			kmem_free(nacl, acl_size);
+			return (error);
+		}
+	}
+	*_size = acl_size;
+	*_nacl = nacl;
+	return (0);
+}
+
+/*
+ * Convert one nfsace4i into a ZFS ace_t (inverse of acep_to_nfsace4i()).
+ *
+ * Entries flagged ACEI4_SPECIAL_WHO are mapped back to the owner@, group@
+ * or everyone@ flag bits with a_who set to -1; all other entries keep
+ * their numeric id.  Returns 0, or -EINVAL for an unknown special id.
+ */
+static int
+nfsace4i_to_acep(const nfsace4i *nacep, ace_t *acep)
+{
+	acep->a_type = nacep->type;
+	acep->a_flags = nacep->flag & NFS41_FLAGS;
+	acep->a_access_mask = nacep->access_mask;
+	if (nacep->iflag & ACEI4_SPECIAL_WHO) {
+		switch (nacep->who) {
+		case ACE4_SPECIAL_OWNER:
+			acep->a_flags |= ACE_OWNER;
+			acep->a_who = -1;
+			break;
+
+		case ACE4_SPECIAL_GROUP:
+			acep->a_flags |= (ACE_GROUP | ACE_IDENTIFIER_GROUP);
+			acep->a_who = -1;
+			break;
+
+		case ACE4_SPECIAL_EVERYONE:
+			acep->a_flags |= ACE_EVERYONE;
+			acep->a_who = -1;
+			break;
+
+		default:
+			dprintf("Unknown id 0x%08x\n", nacep->who);
+			return (-EINVAL);
+		}
+	} else {
+		acep->a_who = nacep->who;
+	}
+
+	return (0);
+}
+
+/*
+ * Build a vsecattr_t from a decoded nfsacl41i.
+ *
+ * On success `_vsecp' receives a vsecattr_t whose vsa_aclentp array was
+ * allocated here; the caller releases it with
+ * kmem_free(vsa_aclentp, vsa_aclentsz).  On failure nothing is handed to
+ * the caller, the array is freed here and `_vsecp' is left untouched.
+ */
+static int
+nfsacl41i_to_zfsacl(const nfsacl41i *nacl, vsecattr_t *_vsecp)
+{
+	int i, error = 0;
+	vsecattr_t vsecp;
+
+	vsecp.vsa_aclcnt = nacl->na41_aces.na41_aces_len;
+	vsecp.vsa_aclflags = nacl->na41_flag;
+	vsecp.vsa_aclentsz = vsecp.vsa_aclcnt * sizeof (ace_t);
+	vsecp.vsa_mask = (VSA_ACE | VSA_ACE_ACLFLAGS);
+	vsecp.vsa_aclentp = kmem_alloc(vsecp.vsa_aclentsz, KM_SLEEP);
+
+	for (i = 0; i < vsecp.vsa_aclcnt; i++) {
+		ace_t *acep = vsecp.vsa_aclentp + (i * sizeof (ace_t));
+		nfsace4i *nacep = &nacl->na41_aces.na41_aces_val[i];
+		error = nfsace4i_to_acep(nacep, acep);
+		if (error) {
+			/* the caller never sees this array, so free it */
+			kmem_free(vsecp.vsa_aclentp, vsecp.vsa_aclentsz);
+			return (error);
+		}
+	}
+	*_vsecp = vsecp;
+	return (error);
+}
+
+/*
+ * getxattr handler: return the file's ACL XDR-encoded as an nfsacl41i.
+ *
+ * With size == 0 only the buffer size required for the current ACL is
+ * returned.  Otherwise the ACL is encoded into `buffer' and the number of
+ * bytes used is returned.  Errors are returned as negative errno values:
+ * -EOPNOTSUPP when the acltype is not NFSv4, -EINVAL when the buffer
+ * cannot hold even one ACE, -ERANGE when it is too small for the ACL,
+ * -ENODATA for an empty ACL, -ENOMEM when conversion or encoding fails,
+ * or the negated zfs_getsecattr() error.
+ */
+static int
+__zpl_xattr_nfs41acl_get(struct inode *ip, const char *name,
+    void *buffer, size_t size)
+{
+	vsecattr_t vsecp;
+	cred_t *cr = CRED();
+	int ret, fl;
+	size_t acl_size = 0, xdr_size = 0;
+	XDR xdr = {0};
+	boolean_t ok;
+	nfsacl41i *nacl = NULL;
+
+	/* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+	if (strcmp(name, "") != 0)
+		return (-EINVAL);
+#endif
+
+	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_NFSV4)
+		return (-EOPNOTSUPP);
+
+	if (size == 0) {
+		/*
+		 * API user may send 0 size so that we
+		 * return size of buffer needed for ACL.
+		 */
+		crhold(cr);
+		vsecp.vsa_mask = VSA_ACECNT;
+		ret = -zfs_getsecattr(ITOZ(ip), &vsecp, ATTR_NOACLCHECK, cr);
+		/* drop the hold before testing ret so it cannot leak */
+		crfree(cr);
+		if (ret) {
+			return (ret);
+		}
+		ret = ACES_TO_XDRSIZE(vsecp.vsa_aclcnt);
+		return (ret);
+	}
+
+	if (size < ACES_TO_XDRSIZE(1)) {
+		return (-EINVAL);
+	}
+
+	vsecp.vsa_mask = VSA_ACE_ALLTYPES | VSA_ACECNT | VSA_ACE |
+	    VSA_ACE_ACLFLAGS;
+
+	crhold(cr);
+	fl = capable(CAP_DAC_OVERRIDE) ? ATTR_NOACLCHECK : 0;
+	ret = -zfs_getsecattr(ITOZ(ip), &vsecp, fl, cr);
+	crfree(cr);
+
+	if (ret) {
+		return (ret);
+	}
+
+	if (vsecp.vsa_aclcnt == 0) {
+		ret = -ENODATA;
+		goto nfs4acl_get_out;
+	}
+
+	xdr_size = ACES_TO_XDRSIZE(vsecp.vsa_aclcnt);
+	if (xdr_size > size) {
+		ret = -ERANGE;
+		goto nfs4acl_get_out;
+	}
+
+	ret = zfsacl_to_nfsacl41i(vsecp, &nacl, &acl_size);
+	if (ret) {
+		ret = -ENOMEM;
+		goto nfs4acl_get_out;
+	}
+
+	xdrmem_create(&xdr, (char *)buffer, xdr_size, XDR_ENCODE);
+	ok = xdr_nfsacl41i(&xdr, nacl);
+	/* the intermediate ACL is not needed once encoding was attempted */
+	kmem_free(nacl, acl_size);
+	ret = ok ? xdr_size : -ENOMEM;
+
+nfs4acl_get_out:
+	kmem_free(vsecp.vsa_aclentp, vsecp.vsa_aclentsz);
+
+	return (ret);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_nfs41acl_get);
+
+/*
+ * setxattr handler: decode an XDR-encoded nfsacl41i from `value' and
+ * install it as the file's ACL.
+ *
+ * `size' must be XDRBASE plus a whole number of ACEs, with at most
+ * NFS41ACL_MAX_ACES entries, and `value' must be 4-byte aligned.  Returns
+ * 0 or a negative errno: -EOPNOTSUPP when the acltype is not NFSv4,
+ * -EINVAL for a missing, short, misaligned or mis-sized payload, -E2BIG
+ * for too many ACEs, -ENOMEM when decoding fails, or the error from the
+ * conversion / zfs_setsecattr().  `name' and `flags' are not used here.
+ */
+static int
+__zpl_xattr_nfs41acl_set(struct inode *ip, const char *name,
+    const void *value, size_t size, int flags)
+{
+	cred_t *cr = CRED();
+	vsecattr_t vsecp;
+	char *bufp = NULL;
+	nfsacl41i *nacl = NULL;
+	boolean_t ok;
+	XDR xdr = {0};
+	int error, fl, naces;
+
+	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_NFSV4)
+		return (-EOPNOTSUPP);
+
+	/*
+	 * TODO: we may receive NULL value and size 0
+	 * when rmxattr() on our special xattr is called.
+	 * A function to "strip" the ACL needs to be added
+	 * to avoid POLA violation.
+	 */
+
+	/*
+	 * Until that exists, reject a missing or truncated payload
+	 * explicitly.  Below XDRBASE the unsigned subtraction in
+	 * XDRSIZE_TO_ACES() wraps around, and the request was only turned
+	 * away by accident of integer truncation.
+	 */
+	if (value == NULL || size < XDRBASE) {
+		return (-EINVAL);
+	}
+
+	/* xdr data is 4-byte aligned */
+	if (((ulong_t)value % 4) != 0) {
+		return (-EINVAL);
+	}
+
+	naces = XDRSIZE_TO_ACES(size);
+	if (naces > NFS41ACL_MAX_ACES) {
+		return (-E2BIG);
+	}
+
+	if (!XDRSIZE_IS_VALID(size)) {
+		return (-EINVAL);
+	}
+	bufp = (char *)value;
+	/*
+	 * Zeroed so the decoder starts from a NULL ACE array pointer.
+	 * NOTE(review): XDR array decoders normally allocate only when the
+	 * pointer is NULL -- confirm against xdr_nfsacl41i.
+	 */
+	nacl = kmem_zalloc(sizeof (nfsacl41i), KM_SLEEP);
+	/*
+	 * NULL may still be returned with KM_SLEEP set.
+	 * In principle, checks for SIZE of xattr are
+	 * sufficient to protect, but check for NULL anyway.
+	 */
+	if (nacl == NULL) {
+		return (-ENOMEM);
+	}
+
+	/*
+	 * The stream is bounded by the number of bytes the caller actually
+	 * supplied.  The in-memory ACL size is larger than the encoded
+	 * size and would let the decoder run past the end of `value'.
+	 */
+	xdrmem_create(&xdr, bufp, size, XDR_DECODE);
+	ok = xdr_nfsacl41i(&xdr, nacl);
+	if (!ok) {
+		/*
+		 * NOTE(review): a partially decoded ACE array may still be
+		 * attached to nacl here; its state after a failed decode is
+		 * not visible from this file -- confirm and free if needed.
+		 */
+		kmem_free(nacl, sizeof (nfsacl41i));
+		return (-ENOMEM);
+	}
+	error = nfsacl41i_to_zfsacl(nacl, &vsecp);
+
+	/*
+	 * XDR_DECODE allocates memory for the array of aces.  It is no
+	 * longer needed whether or not the conversion succeeded.
+	 */
+	kmem_free(nacl->na41_aces.na41_aces_val,
+	    (nacl->na41_aces.na41_aces_len * ACE4SIZE));
+	kmem_free(nacl, sizeof (nfsacl41i));
+	if (error) {
+		return (error);
+	}
+
+	crhold(cr);
+	fl = capable(CAP_DAC_OVERRIDE) ? ATTR_NOACLCHECK : 0;
+	error = -zfs_setsecattr(ITOZ(ip), &vsecp, fl, cr);
+	crfree(cr);
+
+	kmem_free(vsecp.vsa_aclentp, vsecp.vsa_aclentsz);
+	return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_nfs41acl_set);
+
+/*
+ * ACL access xattr namespace handlers.
+ *
+ * Use .name instead of .prefix when available. xattr_resolve_name will match
+ * whole name and reject anything that has .name only as prefix.
+ */
+xattr_handler_t zpl_xattr_nfs41acl_handler =
+{
+#ifdef HAVE_XATTR_HANDLER_NAME
+	.name	= NFS41ACL_XATTR,
+#else
+	.prefix	= NFS41ACL_XATTR,
+#endif
+	.list	= zpl_xattr_nfs41acl_list,
+	.get	= zpl_xattr_nfs41acl_get,
+	.set	= zpl_xattr_nfs41acl_set,
+};
+
 xattr_handler_t *zpl_xattr_handlers[] = {
 	&zpl_xattr_security_handler,
 	&zpl_xattr_trusted_handler,
@@ -1465,6 +1931,7 @@ xattr_handler_t *zpl_xattr_handlers[] = {
 	&zpl_xattr_acl_access_handler,
 	&zpl_xattr_acl_default_handler,
 #endif /* CONFIG_FS_POSIX_ACL */
+	&zpl_xattr_nfs41acl_handler,
 	NULL
 };
 
@@ -1493,6 +1960,10 @@ zpl_xattr_handler(const char *name)
 		return (&zpl_xattr_acl_default_handler);
 #endif /* CONFIG_FS_POSIX_ACL */
 
+	if (strncmp(name, NFS41ACL_XATTR,
+	    sizeof (NFS41ACL_XATTR)) == 0)
+		return (&zpl_xattr_nfs41acl_handler);
+
 	return (NULL);
 }
 
From 9290fb7361fb2692ee62f0f12fa384ad885dbe56 Mon Sep 17 00:00:00 2001
From: Ryan Moeller
Date: Thu, 4 Jun 2020 13:32:32 -0400
Subject: [PATCH 04/35] Add zfsd for FreeBSD

Signed-off-by: Ryan Moeller
---
 Makefile.am | 2 +-
 cmd/Makefile.am | 3 +
 cmd/zfsd/.gitignore | 1 +
 cmd/zfsd/Makefile.am | 25 +
 cmd/zfsd/callout.cc | 220 +++++++
 cmd/zfsd/callout.h | 185 ++++++
 cmd/zfsd/case_file.cc | 1195 ++++++++++++++++++++++++++++++++++++
 cmd/zfsd/case_file.h | 452 ++++++++++++++
 cmd/zfsd/vdev.cc | 358 +++++++++++
 cmd/zfsd/vdev.h | 188 ++++++
 cmd/zfsd/vdev_iterator.cc | 172 ++++++
 cmd/zfsd/vdev_iterator.h | 123 ++++
 cmd/zfsd/zfsd.8 | 154 +++++
 cmd/zfsd/zfsd.cc | 449 ++++++++++++++
 cmd/zfsd/zfsd.h | 228 +++++++
 cmd/zfsd/zfsd_event.cc | 488 +++++++++++++++
 cmd/zfsd/zfsd_event.h | 144 +++++
 cmd/zfsd/zfsd_exception.cc | 135 ++++
 cmd/zfsd/zfsd_exception.h | 109 ++++
 cmd/zfsd/zfsd_main.cc | 90 +++
 cmd/zfsd/zpool_list.cc | 122 ++++
 cmd/zfsd/zpool_list.h | 133 ++++
 config/Rules.am | 9 +
 configure.ac | 2 +
 etc/Makefile.am | 8 +
 etc/rc.d/.gitignore | 1 +
 etc/rc.d/zfsd.in | 14 +
 27 files changed, 5009 insertions(+), 1 deletion(-)
 create mode 100644 cmd/zfsd/.gitignore
 create mode
100644 cmd/zfsd/Makefile.am create mode 100644 cmd/zfsd/callout.cc create mode 100644 cmd/zfsd/callout.h create mode 100644 cmd/zfsd/case_file.cc create mode 100644 cmd/zfsd/case_file.h create mode 100644 cmd/zfsd/vdev.cc create mode 100644 cmd/zfsd/vdev.h create mode 100644 cmd/zfsd/vdev_iterator.cc create mode 100644 cmd/zfsd/vdev_iterator.h create mode 100644 cmd/zfsd/zfsd.8 create mode 100644 cmd/zfsd/zfsd.cc create mode 100644 cmd/zfsd/zfsd.h create mode 100644 cmd/zfsd/zfsd_event.cc create mode 100644 cmd/zfsd/zfsd_event.h create mode 100644 cmd/zfsd/zfsd_exception.cc create mode 100644 cmd/zfsd/zfsd_exception.h create mode 100644 cmd/zfsd/zfsd_main.cc create mode 100644 cmd/zfsd/zpool_list.cc create mode 100644 cmd/zfsd/zpool_list.h create mode 100644 etc/rc.d/.gitignore create mode 100644 etc/rc.d/zfsd.in diff --git a/Makefile.am b/Makefile.am index 76ec01f2aa31..f49766a45acd 100644 --- a/Makefile.am +++ b/Makefile.am @@ -119,7 +119,7 @@ cstyle_line = -exec ${top_srcdir}/scripts/cstyle.pl -cpP {} + endif CHECKS += cstyle cstyle: - $(AM_V_at)find $(top_srcdir) -name build -prune \ + $(AM_V_at)find $(top_srcdir) -name build -prune -o -name zfsd -prune \ -o -type f -name '*.[hc]' \ ! -name 'zfs_config.*' ! -name '*.mod.c' \ ! -name 'nfs41acl_xdr.c' ! 
-name 'nfs41acl.h' \ diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 2bd9d039f20e..74a93c08a926 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -64,6 +64,9 @@ include $(srcdir)/%D%/zpool/Makefile.am include $(srcdir)/%D%/zpool_influxdb/Makefile.am include $(srcdir)/%D%/zstream/Makefile.am +if BUILD_FREEBSD +include $(srcdir)/%D%/zfsd/Makefile.am +endif if BUILD_LINUX mounthelper_PROGRAMS += mount.zfs diff --git a/cmd/zfsd/.gitignore b/cmd/zfsd/.gitignore new file mode 100644 index 000000000000..5e3efca4d908 --- /dev/null +++ b/cmd/zfsd/.gitignore @@ -0,0 +1 @@ +/zfsd diff --git a/cmd/zfsd/Makefile.am b/cmd/zfsd/Makefile.am new file mode 100644 index 000000000000..0371d389bee3 --- /dev/null +++ b/cmd/zfsd/Makefile.am @@ -0,0 +1,25 @@ +zfsd_CFLAGS = $(AM_CFLAGS) +zfsd_CXXFLAGS = $(AM_CXXFLAGS) +zfsd_CPPFLAGS = $(AM_CPPFLAGS) + +sbin_PROGRAMS += zfsd + +zfsd_SOURCES = \ + %D%/callout.cc \ + %D%/case_file.cc \ + %D%/vdev.cc \ + %D%/vdev_iterator.cc \ + %D%/zfsd.cc \ + %D%/zfsd_event.cc \ + %D%/zfsd_exception.cc \ + %D%/zfsd_main.cc \ + %D%/zpool_list.cc + +zfsd_LDADD = \ + libnvpair.la \ + libuutil.la \ + libzfs_core.la \ + libzfs.la + +zfsd_LDADD += -lrt -lprivatedevdctl -lgeom -lbsdxml -lsbuf +zfsd_LDFLAGS = -pthread diff --git a/cmd/zfsd/callout.cc b/cmd/zfsd/callout.cc new file mode 100644 index 000000000000..3e5cd5779559 --- /dev/null +++ b/cmd/zfsd/callout.cc @@ -0,0 +1,220 @@ +/*- + * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. 
Redistributions in binary form must reproduce at minimum a disclaimer
+ *    substantially similar to the "NO WARRANTY" disclaimer below
+ *    ("Disclaimer") and any redistribution must be conditioned upon
+ *    including a substantially similar Disclaimer requirement for further
+ *    binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
+ *
+ * $FreeBSD$
+ */
+
+/**
+ * \file callout.cc
+ *
+ * \brief Implementation of the Callout class - multi-client
+ *        timer services built on top of the POSIX interval timer.
+ */
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "callout.h"
+#include "vdev_iterator.h"
+#include "zfsd.h"
+#include "zfsd_exception.h"
+
+std::list<Callout *> Callout::s_activeCallouts;
+bool		      Callout::s_alarmFired(false);
+
+/* Route SIGALRM to the handler that flags expired callouts. */
+void
+Callout::Init()
+{
+	signal(SIGALRM, Callout::AlarmSignalHandler);
+}
+
+/*
+ * Remove this callout from the active list.  Returns false when it was
+ * not pending, true when a pending callout was cancelled.
+ */
+bool
+Callout::Stop()
+{
+	if (!IsPending())
+		return (false);
+
+	for (std::list<Callout *>::iterator it(s_activeCallouts.begin());
+	     it != s_activeCallouts.end(); it++) {
+		if (*it != this)
+			continue;
+
+		it = s_activeCallouts.erase(it);
+		if (it != s_activeCallouts.end()) {
+
+			/*
+			 * Maintain correct interval for the
+			 * callouts that follow the just removed
+			 * entry.
+			 */
+			timeradd(&(*it)->m_interval, &m_interval,
+				 &(*it)->m_interval);
+		}
+		break;
+	}
+	m_pending = false;
+	return (true);
+}
+
+/*
+ * (Re)arm this callout.  Intervals in the active list are stored relative
+ * to the entry in front of them, so the new entry's interval is reduced by
+ * every entry it is queued behind and the entry it is inserted in front of
+ * is reduced by the new entry's remainder.  Throws ZfsdException for a
+ * zero interval.  Returns whether a pending callout was cancelled.
+ */
+bool
+Callout::Reset(const timeval &interval, CalloutFunc_t *func, void *arg)
+{
+	bool cancelled(false);
+
+	if (!timerisset(&interval))
+		throw ZfsdException("Callout::Reset: interval of 0");
+
+	cancelled = Stop();
+
+	m_interval = interval;
+	m_func     = func;
+	m_arg      = arg;
+	m_pending  = true;
+
+	std::list<Callout *>::iterator it(s_activeCallouts.begin());
+	for (; it != s_activeCallouts.end(); it++) {
+
+		if (timercmp(&(*it)->m_interval, &m_interval, <=)) {
+			/*
+			 * Decrease our interval by those that come
+			 * before us.
+			 */
+			timersub(&m_interval, &(*it)->m_interval, &m_interval);
+		} else {
+			/*
+			 * Account for the time between the newly
+			 * inserted event and those that follow.
+			 */
+			timersub(&(*it)->m_interval, &m_interval,
+				 &(*it)->m_interval);
+			break;
+		}
+	}
+	s_activeCallouts.insert(it, this);
+
+
+	if (s_activeCallouts.front() == this) {
+		itimerval timerval = { {0, 0}, m_interval };
+
+		setitimer(ITIMER_REAL, &timerval, NULL);
+	}
+
+	return (cancelled);
+}
+
+/* SIGALRM handler: record the expiry and wake the event loop. */
+void
+Callout::AlarmSignalHandler(int)
+{
+	s_alarmFired = true;
+	ZfsDaemon::WakeEventLoop();
+}
+
+/*
+ * Run the callbacks of every callout that is due, then re-arm the
+ * interval timer for the next entry in the list, if any.
+ */
+void
+Callout::ExpireCallouts()
+{
+	if (!s_alarmFired)
+		return;
+
+	s_alarmFired = false;
+	if (s_activeCallouts.empty()) {
+		/* Callout removal/SIGALRM race was lost. */
+		return;
+	}
+
+	/*
+	 * Expire the first callout (the one we used to set the
+	 * interval timer) as well as any callouts following that
+	 * expire at the same time (have a zero interval from
+	 * the callout before it).
+	 */
+	do {
+		Callout *cur(s_activeCallouts.front());
+		s_activeCallouts.pop_front();
+		cur->m_pending = false;
+		cur->m_func(cur->m_arg);
+	} while (!s_activeCallouts.empty()
+	      && timerisset(&s_activeCallouts.front()->m_interval) == 0);
+
+	if (!s_activeCallouts.empty()) {
+		Callout *next(s_activeCallouts.front());
+		itimerval timerval = { { 0, 0 }, next->m_interval };
+
+		setitimer(ITIMER_REAL, &timerval, NULL);
+	}
+}
+
+timeval
+Callout::TimeRemaining() const
+{
+	/*
+	 * Outline: Add the m_interval for each callout in s_activeCallouts
+	 * ahead of this, except for the first callout.  Add to that the result
+	 * of getitimer (That's because the first callout stores its original
+	 * interval setting while the timer is ticking).
+	 *
+	 * When this callout is itself the first one, the value reported by
+	 * getitimer is the whole answer; none of the intervals of the
+	 * callouts queued behind it may be added.
+	 */
+	itimerval timervalToAlarm;
+	timeval timeToExpiry;
+	std::list<Callout *>::iterator it;
+
+	if (!IsPending()) {
+		timeToExpiry.tv_sec = INT_MAX;
+		timeToExpiry.tv_usec = 999999;	/*maximum normalized value*/
+		return (timeToExpiry);
+	}
+
+	timerclear(&timeToExpiry);
+	getitimer(ITIMER_REAL, &timervalToAlarm);
+	timeval& timeToAlarm = timervalToAlarm.it_value;
+	timeradd(&timeToExpiry, &timeToAlarm, &timeToExpiry);
+
+	it = s_activeCallouts.begin();
+	/* also guards the increment below against an empty list */
+	if (it == s_activeCallouts.end() || *it == this)
+		return (timeToExpiry);
+
+	/*skip the first callout in the list*/
+	for (it++; it != s_activeCallouts.end(); it++) {
+		timeradd(&timeToExpiry, &(*it)->m_interval, &timeToExpiry);
+		if ((*it) == this)
+			break;
+	}
+	return (timeToExpiry);
+}
diff --git a/cmd/zfsd/callout.h b/cmd/zfsd/callout.h
new file mode 100644
index 000000000000..d6b83bcfe5f5
--- /dev/null
+++ b/cmd/zfsd/callout.h
@@ -0,0 +1,185 @@
+/*-
+ * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    substantially similar to the "NO WARRANTY" disclaimer below
+ *    ("Disclaimer") and any redistribution must be conditioned upon
+ *    including a substantially similar Disclaimer requirement for further
+ *    binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
+ *
+ * $FreeBSD$
+ */
+
+/**
+ * \file callout.h
+ *
+ * \brief Interface for timer based callback services.
+ *
+ * Header requirements:
+ *
+ *    #include <sys/time.h>
+ *
+ *    #include <list>
+ */
+
+#ifndef _CALLOUT_H_
+#define _CALLOUT_H_
+
+/**
+ * \brief Type of the function callback from a Callout.
+ */
+typedef void CalloutFunc_t(void *);
+
+/**
+ * \brief Interface to a schedulable one-shot timer with the granularity
+ *        of the system clock (see setitimer(2)).
+ *
+ * Determination of callback expiration is triggered by the SIGALRM
+ * signal.  Callout callbacks are always delivered from Zfsd's event
+ * processing loop.
+ *
+ * Periodic actions can be triggered via the Callout mechanisms by
+ * resetting the Callout from within its callback.
+ */
+class Callout
+{
+public:
+
+	/**
+	 * Initialize the Callout subsystem.
+	 */
+	static void Init();
+
+	/**
+	 * Function called (via SIGALRM) when our interval
+	 * timer expires.
+	 */
+	static void AlarmSignalHandler(int);
+
+	/**
+	 * Execute callbacks for all callouts that have the same
+	 * expiration time as the first callout in the list.
+	 */
+	static void ExpireCallouts();
+
+	/** Constructor. */
+	Callout();
+
+	/**
+	 * Returns true if callout has not been stopped,
+	 * or deactivated since the last time the callout was
+	 * reset.
+	 */
+	bool IsActive() const;
+
+	/**
+	 * Returns true if callout is still waiting to expire.
+	 */
+	bool IsPending() const;
+
+	/**
+	 * Disestablish a callout.
+	 */
+	bool Stop();
+
+	/**
+	 * \brief Establish or change a timeout.
+	 *
+	 * \param interval  Timeval indicating the time which must elapse
+	 *                  before this callout fires.
+	 * \param func      Pointer to the callback function
+	 * \param arg       Argument pointer to pass to callback function
+	 *
+	 * \return  Cancellation status.
+	 *             true:  The previous callback was pending and therefore
+	 *                    was cancelled.
+	 *             false: The callout was not pending at the time of this
+	 *                    reset request.
+	 *          In all cases, a new callout is established.
+	 */
+	bool  Reset(const timeval &interval, CalloutFunc_t *func, void *arg);
+
+	/**
+	 * \brief Calculate the remaining time until this Callout's timer
+	 *        expires.
+	 *
+	 * The return value will be slightly greater than the actual time to
+	 * expiry.
+	 *
+	 * If the callout is not pending, returns INT_MAX.
+	 */
+	timeval TimeRemaining() const;
+
+private:
+	/**
+	 * All active callouts sorted by expiration time.  The callout
+	 * with the nearest expiration time is at the head of the list.
+	 */
+	static std::list<Callout *> s_activeCallouts;
+
+	/**
+	 * The interval timer has expired.  This variable is set from
+	 * signal handler context and tested from Zfsd::EventLoop()
+	 * context via ExpireCallouts().
+	 */
+	static bool                 s_alarmFired;
+
+	/**
+	 * Time, relative to others in the active list, until
+	 * this callout is fired.
+	 */
+	timeval                     m_interval;
+
+	/** Callback function argument. */
+	void                       *m_arg;
+
+	/**
+	 * The callback function associated with this timer
+	 * entry.
+	 */
+	CalloutFunc_t              *m_func;
+
+	/** State of this callout. */
+	bool                        m_pending;
+};
+
+//- Callout public const methods ----------------------------------------------
+inline bool
+Callout::IsPending() const
+{
+	return (m_pending);
+}
+
+//- Callout public methods ----------------------------------------------------
+inline
+Callout::Callout()
+ : m_arg(0),
+   m_func(NULL),
+   m_pending(false)
+{
+	timerclear(&m_interval);
+}
+
+#endif /* _CALLOUT_H_ */
diff --git a/cmd/zfsd/case_file.cc b/cmd/zfsd/case_file.cc
new file mode 100644
index 000000000000..5064d3c266dd
--- /dev/null
+++ b/cmd/zfsd/case_file.cc
@@ -0,0 +1,1195 @@
+/*-
+ * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    substantially similar to the "NO WARRANTY" disclaimer below
+ *    ("Disclaimer") and any redistribution must be conditioned upon
+ *    including a substantially similar Disclaimer requirement for further
+ *    binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + */ + +/** + * \file case_file.cc + * + * We keep case files for any leaf vdev that is not in the optimal state. + * However, we only serialize to disk those events that need to be preserved + * across reboots. For now, this is just a log of soft errors which we + * accumulate in order to mark a device as degraded. + */ +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "callout.h" +#include "vdev_iterator.h" +#include "zfsd_event.h" +#include "case_file.h" +#include "vdev.h" +#include "zfsd.h" +#include "zfsd_exception.h" +#include "zpool_list.h" + +__FBSDID("$FreeBSD$"); + +/*============================ Namespace Control =============================*/ +using std::hex; +using std::ifstream; +using std::stringstream; +using std::setfill; +using std::setw; + +using DevdCtl::Event; +using DevdCtl::EventFactory; +using DevdCtl::EventList; +using DevdCtl::Guid; +using DevdCtl::ParseException; + +/*--------------------------------- CaseFile ---------------------------------*/ +//- CaseFile Static Data ------------------------------------------------------- + +CaseFileList CaseFile::s_activeCases; +const string CaseFile::s_caseFilePath = "/var/db/zfsd/cases"; +const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 
/*usec*/}; + +//- CaseFile Static Public Methods --------------------------------------------- +CaseFile * +CaseFile::Find(Guid poolGUID, Guid vdevGUID) +{ + for (CaseFileList::iterator curCase = s_activeCases.begin(); + curCase != s_activeCases.end(); curCase++) { + + if (((*curCase)->PoolGUID() != poolGUID + && Guid::InvalidGuid() != poolGUID) + || (*curCase)->VdevGUID() != vdevGUID) + continue; + + /* + * We only carry one active case per-vdev. + */ + return (*curCase); + } + return (NULL); +} + +void +CaseFile::Find(Guid poolGUID, Guid vdevGUID, CaseFileList &cases) +{ + for (CaseFileList::iterator curCase = s_activeCases.begin(); + curCase != s_activeCases.end(); curCase++) { + if (((*curCase)->PoolGUID() != poolGUID && + Guid::InvalidGuid() != poolGUID) || + (*curCase)->VdevGUID() != vdevGUID) + continue; + + /* + * We can have multiple cases for spare vdevs + */ + cases.push_back(*curCase); + if (!(*curCase)->IsSpare()) { + return; + } + } +} + +CaseFile * +CaseFile::Find(const string &physPath) +{ + CaseFile *result = NULL; + + for (CaseFileList::iterator curCase = s_activeCases.begin(); + curCase != s_activeCases.end(); curCase++) { + + if ((*curCase)->PhysicalPath() != physPath) + continue; + + if (result != NULL) { + syslog(LOG_WARNING, "Multiple casefiles found for " + "physical path %s. 
" + "This is most likely a bug in zfsd", + physPath.c_str()); + } + result = *curCase; + } + return (result); +} + + +void +CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event) +{ + CaseFileList::iterator casefile; + for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){ + CaseFileList::iterator next = casefile; + next++; + if (poolGUID == (*casefile)->PoolGUID()) + (*casefile)->ReEvaluate(event); + casefile = next; + } +} + +CaseFile & +CaseFile::Create(Vdev &vdev) +{ + CaseFile *activeCase; + + activeCase = Find(vdev.PoolGUID(), vdev.GUID()); + if (activeCase == NULL) + activeCase = new CaseFile(vdev); + + return (*activeCase); +} + +void +CaseFile::DeSerialize() +{ + struct dirent **caseFiles; + + int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles, + DeSerializeSelector, /*compar*/NULL)); + + if (numCaseFiles == -1) + return; + if (numCaseFiles == 0) { + free(caseFiles); + return; + } + + for (int i = 0; i < numCaseFiles; i++) { + + DeSerializeFile(caseFiles[i]->d_name); + free(caseFiles[i]); + } + free(caseFiles); +} + +bool +CaseFile::Empty() +{ + return (s_activeCases.empty()); +} + +void +CaseFile::LogAll() +{ + for (CaseFileList::iterator curCase = s_activeCases.begin(); + curCase != s_activeCases.end(); curCase++) + (*curCase)->Log(); +} + +void +CaseFile::PurgeAll() +{ + /* + * Serialize casefiles before deleting them so that they can be reread + * and revalidated during BuildCaseFiles. + * CaseFiles remove themselves from this list on destruction. + */ + while (s_activeCases.size() != 0) { + CaseFile *casefile = s_activeCases.front(); + casefile->Serialize(); + delete casefile; + } + +} + +int +CaseFile::IsSpare() +{ + return (m_is_spare); +} + +//- CaseFile Public Methods ---------------------------------------------------- +bool +CaseFile::RefreshVdevState() +{ + ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); + zpool_handle_t *casePool(zpl.empty() ? 
NULL : zpl.front()); + if (casePool == NULL) + return (false); + + Vdev vd(casePool, CaseVdev(casePool)); + if (vd.DoesNotExist()) + return (false); + + m_vdevState = vd.State(); + m_vdevPhysPath = vd.PhysicalPath(); + return (true); +} + +bool +CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev) +{ + ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); + zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front()); + int flags = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE; + + if (pool == NULL || !RefreshVdevState()) { + /* + * The pool or vdev for this case file is no longer + * part of the configuration. This can happen + * if we process a device arrival notification + * before seeing the ZFS configuration change + * event. + */ + syslog(LOG_INFO, + "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured. " + "Closing\n", + PoolGUIDString().c_str(), + VdevGUIDString().c_str()); + Close(); + + /* + * Since this event was not used to close this + * case, do not report it as consumed. + */ + return (/*consumed*/false); + } + + if (VdevState() > VDEV_STATE_CANT_OPEN) { + /* + * For now, newly discovered devices only help for + * devices that are missing. In the future, we might + * use a newly inserted spare to replace a degraded + * or faulted device. 
+ */ + syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored", + PoolGUIDString().c_str(), VdevGUIDString().c_str()); + return (/*consumed*/false); + } + + if (vdev != NULL + && ( vdev->PoolGUID() == m_poolGUID + || vdev->PoolGUID() == Guid::InvalidGuid()) + && vdev->GUID() == m_vdevGUID) { + + if (IsSpare()) + flags |= ZFS_ONLINE_SPARE; + if (zpool_vdev_online(pool, vdev->GUIDString().c_str(), + flags, &m_vdevState) != 0) { + syslog(LOG_ERR, + "Failed to online vdev(%s/%s:%s): %s: %s\n", + zpool_get_name(pool), vdev->GUIDString().c_str(), + devPath.c_str(), libzfs_error_action(g_zfsHandle), + libzfs_error_description(g_zfsHandle)); + return (/*consumed*/false); + } + + syslog(LOG_INFO, "Onlined vdev(%s/%s:%s). State now %s.\n", + zpool_get_name(pool), vdev->GUIDString().c_str(), + devPath.c_str(), + zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); + + /* + * Check the vdev state post the online action to see + * if we can retire this case. + */ + CloseIfSolved(); + + return (/*consumed*/true); + } + + /* + * If the auto-replace policy is enabled, and we have physical + * path information, try a physical path replacement. + */ + if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) { + syslog(LOG_INFO, + "CaseFile(%s:%s:%s): AutoReplace not set. " + "Ignoring device insertion.\n", + PoolGUIDString().c_str(), + VdevGUIDString().c_str(), + zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); + return (/*consumed*/false); + } + + if (PhysicalPath().empty()) { + syslog(LOG_INFO, + "CaseFile(%s:%s:%s): No physical path information. " + "Ignoring device insertion.\n", + PoolGUIDString().c_str(), + VdevGUIDString().c_str(), + zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); + return (/*consumed*/false); + } + + if (physPath != PhysicalPath()) { + syslog(LOG_INFO, + "CaseFile(%s:%s:%s): Physical path mismatch. 
" + "Ignoring device insertion.\n", + PoolGUIDString().c_str(), + VdevGUIDString().c_str(), + zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); + return (/*consumed*/false); + } + + /* Write a label on the newly inserted disk. */ + if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) { + syslog(LOG_ERR, + "Replace vdev(%s/%s) by physical path (label): %s: %s\n", + zpool_get_name(pool), VdevGUIDString().c_str(), + libzfs_error_action(g_zfsHandle), + libzfs_error_description(g_zfsHandle)); + return (/*consumed*/false); + } + + syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s", + PoolGUIDString().c_str(), VdevGUIDString().c_str(), + devPath.c_str()); + return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false)); +} + +bool +CaseFile::ReEvaluate(const ZfsEvent &event) +{ + bool consumed(false); + + if (event.Value("type") == "misc.fs.zfs.vdev_remove") { + /* + * The Vdev we represent has been removed from the + * configuration. This case is no longer of value. + */ + Close(); + + return (/*consumed*/true); + } else if (event.Value("type") == "misc.fs.zfs.pool_destroy") { + /* This Pool has been destroyed. Discard the case */ + Close(); + + return (/*consumed*/true); + } else if (event.Value("type") == "misc.fs.zfs.config_sync") { + RefreshVdevState(); + if (VdevState() < VDEV_STATE_HEALTHY) + consumed = ActivateSpare(); + } + + + if (event.Value("class") == "resource.fs.zfs.removed") { + bool spare_activated; + + if (!RefreshVdevState()) { + /* + * The pool or vdev for this case file is no longer + * part of the configuration. This can happen + * if we process a device arrival notification + * before seeing the ZFS configuration change + * event. + */ + syslog(LOG_INFO, + "CaseFile::ReEvaluate(%s,%s) Pool/Vdev " + "unconfigured. 
Closing\n", + PoolGUIDString().c_str(), + VdevGUIDString().c_str()); + /* + * Close the case now so we won't waste cycles in the + * system rescan + */ + Close(); + + /* + * Since this event was not used to close this + * case, do not report it as consumed. + */ + return (/*consumed*/false); + } + + /* + * Discard any tentative I/O error events for + * this case. They were most likely caused by the + * hot-unplug of this device. + */ + PurgeTentativeEvents(); + + /* Try to activate spares if they are available */ + spare_activated = ActivateSpare(); + + /* + * Rescan the drives in the system to see if a recent + * drive arrival can be used to solve this case. + */ + ZfsDaemon::RequestSystemRescan(); + + /* + * Consume the event if we successfully activated a spare. + * Otherwise, leave it in the unconsumed events list so that the + * future addition of a spare to this pool might be able to + * close the case + */ + consumed = spare_activated; + } else if (event.Value("class") == "resource.fs.zfs.statechange") { + RefreshVdevState(); + /* + * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to + * activate a hotspare. 
Otherwise, ignore the event + */ + if (VdevState() == VDEV_STATE_FAULTED || + VdevState() == VDEV_STATE_DEGRADED || + VdevState() == VDEV_STATE_CANT_OPEN) + (void) ActivateSpare(); + consumed = true; + } + else if (event.Value("class") == "ereport.fs.zfs.io" || + event.Value("class") == "ereport.fs.zfs.checksum") { + + m_tentativeEvents.push_front(event.DeepCopy()); + RegisterCallout(event); + consumed = true; + } + + bool closed(CloseIfSolved()); + + return (consumed || closed); +} + +/* Find a Vdev containing the vdev with the given GUID */ +static nvlist_t* +find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid) +{ + nvlist_t **vdevChildren; + int error; + unsigned ch, numChildren; + + error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, + &vdevChildren, &numChildren); + + if (error != 0 || numChildren == 0) + return (NULL); + + for (ch = 0; ch < numChildren; ch++) { + nvlist *result; + Vdev vdev(pool_config, vdevChildren[ch]); + + if (vdev.GUID() == child_guid) + return (config); + + result = find_parent(pool_config, vdevChildren[ch], child_guid); + if (result != NULL) + return (result); + } + + return (NULL); +} + +bool +CaseFile::ActivateSpare() { + nvlist_t *config, *nvroot, *parent_config; + nvlist_t **spares; + const char *devPath, *poolname, *vdev_type; + u_int nspares, i; + int error; + + ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); + zpool_handle_t *zhp(zpl.empty() ? 
NULL : zpl.front()); + if (zhp == NULL) { + syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " + "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID); + return (false); + } + poolname = zpool_get_name(zhp); + config = zpool_get_config(zhp, NULL); + if (config == NULL) { + syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool " + "config for pool %s", poolname); + return (false); + } + error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot); + if (error != 0){ + syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev " + "tree for pool %s", poolname); + return (false); + } + + parent_config = find_parent(config, nvroot, m_vdevGUID); + if (parent_config != NULL) { + const char *parent_type; + + /* + * Don't activate spares for members of a "replacing" vdev. + * They're already dealt with. Sparing them will just drag out + * the resilver process. + */ + error = nvlist_lookup_string(parent_config, + ZPOOL_CONFIG_TYPE, &parent_type); + if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0) + return (false); + } + + nspares = 0; + nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, + &nspares); + if (nspares == 0) { + /* The pool has no spares configured */ + syslog(LOG_INFO, "CaseFile::ActivateSpare: " + "No spares available for pool %s", poolname); + return (false); + } + for (i = 0; i < nspares; i++) { + uint64_t *nvlist_array; + vdev_stat_t *vs; + uint_t nstats; + + if (nvlist_lookup_uint64_array(spares[i], + ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) { + syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not " + "find vdev stats for pool %s, spare %d", + poolname, i); + return (false); + } + vs = reinterpret_cast(nvlist_array); + + if ((vs->vs_aux != VDEV_AUX_SPARED) + && (vs->vs_state == VDEV_STATE_HEALTHY)) { + /* We found a usable spare */ + break; + } + } + + if (i == nspares) { + /* No available spares were found */ + return (false); + } + + error = nvlist_lookup_string(spares[i], 
ZPOOL_CONFIG_PATH, &devPath); + if (error != 0) { + syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " + "the path of pool %s, spare %d. Error %d", + poolname, i, error); + return (false); + } + + error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type); + if (error != 0) { + syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine " + "the vdev type of pool %s, spare %d. Error %d", + poolname, i, error); + return (false); + } + + return (Replace(vdev_type, devPath, /*isspare*/true)); +} + +void +CaseFile::RegisterCallout(const Event &event) +{ + timeval now, countdown, elapsed, timestamp, zero, remaining; + + gettimeofday(&now, 0); + timestamp = event.GetTimestamp(); + timersub(&now, ×tamp, &elapsed); + timersub(&s_removeGracePeriod, &elapsed, &countdown); + /* + * If countdown is <= zero, Reset the timer to the + * smallest positive time value instead + */ + timerclear(&zero); + if (timercmp(&countdown, &zero, <=)) { + timerclear(&countdown); + countdown.tv_usec = 1; + } + + remaining = m_tentativeTimer.TimeRemaining(); + + if (!m_tentativeTimer.IsPending() + || timercmp(&countdown, &remaining, <)) + m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this); +} + + +bool +CaseFile::CloseIfSolved() +{ + if (m_events.empty() + && m_tentativeEvents.empty()) { + + /* + * We currently do not track or take actions on + * devices in the degraded or faulted state. + * Once we have support for spare pools, we'll + * retain these cases so that any spares added in + * the future can be applied to them. + */ + switch (VdevState()) { + case VDEV_STATE_HEALTHY: + /* No need to keep cases for healthy vdevs */ + Close(); + return (true); + case VDEV_STATE_REMOVED: + case VDEV_STATE_CANT_OPEN: + /* + * Keep open. We may solve it with a newly inserted + * device. + */ + case VDEV_STATE_FAULTED: + case VDEV_STATE_DEGRADED: + /* + * Keep open. 
We may solve it with the future + * addition of a spare to the pool + */ + case VDEV_STATE_UNKNOWN: + case VDEV_STATE_CLOSED: + case VDEV_STATE_OFFLINE: + /* + * Keep open? This may not be the correct behavior, + * but it's what we've always done + */ + ; + } + + /* + * Re-serialize the case in order to remove any + * previous event data. + */ + Serialize(); + } + + return (false); +} + +void +CaseFile::Log() +{ + syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(), + VdevGUIDString().c_str(), PhysicalPath().c_str()); + syslog(LOG_INFO, "\tVdev State = %s\n", + zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); + if (m_tentativeEvents.size() != 0) { + syslog(LOG_INFO, "\t=== Tentative Events ===\n"); + for (EventList::iterator event(m_tentativeEvents.begin()); + event != m_tentativeEvents.end(); event++) + (*event)->Log(LOG_INFO); + } + if (m_events.size() != 0) { + syslog(LOG_INFO, "\t=== Events ===\n"); + for (EventList::iterator event(m_events.begin()); + event != m_events.end(); event++) + (*event)->Log(LOG_INFO); + } +} + +//- CaseFile Static Protected Methods ------------------------------------------ +void +CaseFile::OnGracePeriodEnded(void *arg) +{ + CaseFile &casefile(*static_cast(arg)); + + casefile.OnGracePeriodEnded(); +} + +int +CaseFile::DeSerializeSelector(const struct dirent *dirEntry) +{ + uint64_t poolGUID; + uint64_t vdevGUID; + + if (dirEntry->d_type == DT_REG + && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", + &poolGUID, &vdevGUID) == 2) + return (1); + return (0); +} + +void +CaseFile::DeSerializeFile(const char *fileName) +{ + string fullName(s_caseFilePath + '/' + fileName); + CaseFile *existingCaseFile(NULL); + CaseFile *caseFile(NULL); + + try { + uint64_t poolGUID; + uint64_t vdevGUID; + nvlist_t *vdevConf; + + if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case", + &poolGUID, &vdevGUID) != 2) { + throw ZfsdException("CaseFile::DeSerialize: " + "Unintelligible CaseFile filename %s.\n", 
fileName); + } + existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID)); + if (existingCaseFile != NULL) { + /* + * If the vdev is already degraded or faulted, + * there's no point in keeping the state around + * that we use to put a drive into the degraded + * state. However, if the vdev is simply missing, + * preserve the case data in the hopes that it will + * return. + */ + caseFile = existingCaseFile; + vdev_state curState(caseFile->VdevState()); + if (curState > VDEV_STATE_CANT_OPEN + && curState < VDEV_STATE_HEALTHY) { + unlink(fileName); + return; + } + } else { + ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); + if (zpl.empty() + || (vdevConf = VdevIterator(zpl.front()) + .Find(vdevGUID)) == NULL) { + /* + * Either the pool no longer exists + * or this vdev is no longer a member of + * the pool. + */ + unlink(fullName.c_str()); + return; + } + + /* + * Any vdev we find that does not have a case file + * must be in the healthy state and thus worthy of + * continued SERD data tracking. + */ + caseFile = new CaseFile(Vdev(zpl.front(), vdevConf)); + } + + ifstream caseStream(fullName.c_str()); + if (!caseStream) + throw ZfsdException("CaseFile::DeSerialize: Unable to " + "read %s.\n", fileName); + + caseFile->DeSerialize(caseStream); + } catch (const ParseException &exp) { + + exp.Log(); + if (caseFile != existingCaseFile) + delete caseFile; + + /* + * Since we can't parse the file, unlink it so we don't + * trip over it again. 
+ */ + unlink(fileName); + } catch (const ZfsdException &zfsException) { + + zfsException.Log(); + if (caseFile != existingCaseFile) + delete caseFile; + } +} + +//- CaseFile Protected Methods ------------------------------------------------- +CaseFile::CaseFile(const Vdev &vdev) + : m_poolGUID(vdev.PoolGUID()), + m_vdevGUID(vdev.GUID()), + m_vdevState(vdev.State()), + m_vdevPhysPath(vdev.PhysicalPath()), + m_is_spare(vdev.IsSpare()) +{ + stringstream guidString; + + guidString << m_vdevGUID; + m_vdevGUIDString = guidString.str(); + guidString.str(""); + guidString << m_poolGUID; + m_poolGUIDString = guidString.str(); + + s_activeCases.push_back(this); + + syslog(LOG_INFO, "Creating new CaseFile:\n"); + Log(); +} + +CaseFile::~CaseFile() +{ + PurgeEvents(); + PurgeTentativeEvents(); + m_tentativeTimer.Stop(); + s_activeCases.remove(this); +} + +void +CaseFile::PurgeEvents() +{ + for (EventList::iterator event(m_events.begin()); + event != m_events.end(); event++) + delete *event; + + m_events.clear(); +} + +void +CaseFile::PurgeTentativeEvents() +{ + for (EventList::iterator event(m_tentativeEvents.begin()); + event != m_tentativeEvents.end(); event++) + delete *event; + + m_tentativeEvents.clear(); +} + +void +CaseFile::SerializeEvList(const EventList events, int fd, + const char* prefix) const +{ + if (events.empty()) + return; + for (EventList::const_iterator curEvent = events.begin(); + curEvent != events.end(); curEvent++) { + const string &eventString((*curEvent)->GetEventString()); + + // TODO: replace many write(2) calls with a single writev(2) + if (prefix) + write(fd, prefix, strlen(prefix)); + write(fd, eventString.c_str(), eventString.length()); + } +} + +void +CaseFile::Serialize() +{ + stringstream saveFile; + + saveFile << setfill('0') + << s_caseFilePath << "/" + << "pool_" << PoolGUIDString() + << "_vdev_" << VdevGUIDString() + << ".case"; + + if (m_events.empty() && m_tentativeEvents.empty()) { + unlink(saveFile.str().c_str()); + return; + } + + 
int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644)); + if (fd == -1) { + syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n", + saveFile.str().c_str()); + return; + } + SerializeEvList(m_events, fd); + SerializeEvList(m_tentativeEvents, fd, "tentative "); + close(fd); +} + +/* + * XXX: This method assumes that events may not contain embedded newlines. If + * ever events can contain embedded newlines, then CaseFile must switch + * serialization formats + */ +void +CaseFile::DeSerialize(ifstream &caseStream) +{ + string evString; + const EventFactory &factory(ZfsDaemon::Get().GetFactory()); + + caseStream >> std::noskipws >> std::ws; + while (caseStream.good()) { + /* + * Outline: + * read the beginning of a line and check it for + * "tentative". If found, discard "tentative". + * Create a new event + * continue + */ + EventList* destEvents; + const string tentFlag("tentative "); + string line; + std::stringbuf lineBuf; + + caseStream.get(lineBuf); + caseStream.ignore(); /*discard the newline character*/ + line = lineBuf.str(); + if (line.compare(0, tentFlag.size(), tentFlag) == 0) { + /* Discard "tentative" */ + line.erase(0, tentFlag.size()); + destEvents = &m_tentativeEvents; + } else { + destEvents = &m_events; + } + Event *event(Event::CreateEvent(factory, line)); + if (event != NULL) { + destEvents->push_back(event); + RegisterCallout(*event); + } + } +} + +void +CaseFile::Close() +{ + /* + * This case is no longer relevant. Clean up our + * serialization file, and delete the case. + */ + syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n", + PoolGUIDString().c_str(), VdevGUIDString().c_str(), + zpool_state_to_name(VdevState(), VDEV_AUX_NONE)); + + /* + * Serialization of a Case with no event data, clears the + * Serialization data for that event. 
+ */ + PurgeEvents(); + Serialize(); + + delete this; +} + +void +CaseFile::OnGracePeriodEnded() +{ + bool should_fault, should_degrade; + ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); + zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front()); + + m_events.splice(m_events.begin(), m_tentativeEvents); + should_fault = ShouldFault(); + should_degrade = ShouldDegrade(); + + if (should_fault || should_degrade) { + if (zhp == NULL + || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) { + /* + * Either the pool no longer exists + * or this vdev is no longer a member of + * the pool. + */ + Close(); + return; + } + + } + + /* A fault condition has priority over a degrade condition */ + if (ShouldFault()) { + /* Fault the vdev and close the case. */ + if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID, + VDEV_AUX_ERR_EXCEEDED) == 0) { + syslog(LOG_INFO, "Faulting vdev(%s/%s)", + PoolGUIDString().c_str(), + VdevGUIDString().c_str()); + Close(); + return; + } + else { + syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n", + PoolGUIDString().c_str(), + VdevGUIDString().c_str(), + libzfs_error_action(g_zfsHandle), + libzfs_error_description(g_zfsHandle)); + } + } + else if (ShouldDegrade()) { + /* Degrade the vdev and close the case. 
*/ + if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID, + VDEV_AUX_ERR_EXCEEDED) == 0) { + syslog(LOG_INFO, "Degrading vdev(%s/%s)", + PoolGUIDString().c_str(), + VdevGUIDString().c_str()); + Close(); + return; + } + else { + syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n", + PoolGUIDString().c_str(), + VdevGUIDString().c_str(), + libzfs_error_action(g_zfsHandle), + libzfs_error_description(g_zfsHandle)); + } + } + Serialize(); +} + +Vdev +CaseFile::BeingReplacedBy(zpool_handle_t *zhp) { + Vdev vd(zhp, CaseVdev(zhp)); + std::list children; + std::list::iterator children_it; + + Vdev parent(vd.Parent()); + Vdev replacing(NonexistentVdev); + + /* + * To determine whether we are being replaced by another spare that + * is still working, then make sure that it is currently spared and + * that the spare is either resilvering or healthy. If any of these + * conditions fail, then we are not being replaced by a spare. + * + * If the spare is healthy, then the case file should be closed very + * soon after this check. + */ + if (parent.DoesNotExist() + || parent.Name(zhp, /*verbose*/false) != "spare") + return (NonexistentVdev); + + children = parent.Children(); + children_it = children.begin(); + for (;children_it != children.end(); children_it++) { + Vdev child = *children_it; + + /* Skip our vdev. */ + if (child.GUID() == VdevGUID()) + continue; + /* + * Accept the first child that doesn't match our GUID, or + * any resilvering/healthy device if one exists. + */ + if (replacing.DoesNotExist() || child.IsResilvering() + || child.State() == VDEV_STATE_HEALTHY) + replacing = child; + } + + return (replacing); +} + +bool +CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) { + nvlist_t *nvroot, *newvd; + const char *poolname; + string oldstr(VdevGUIDString()); + bool retval = true; + + /* Figure out what pool we're working on */ + ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); + zpool_handle_t *zhp(zpl.empty() ? 
NULL : zpl.front()); + if (zhp == NULL) { + syslog(LOG_ERR, "CaseFile::Replace: could not find pool for " + "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID); + return (false); + } + poolname = zpool_get_name(zhp); + Vdev vd(zhp, CaseVdev(zhp)); + Vdev replaced(BeingReplacedBy(zhp)); + + if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) { + /* If we are already being replaced by a working spare, pass. */ + if (replaced.IsResilvering() + || replaced.State() == VDEV_STATE_HEALTHY) { + syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already " + "replaced", VdevGUIDString().c_str(), path); + return (/*consumed*/false); + } + /* + * If we have already been replaced by a spare, but that spare + * is broken, we must spare the spare, not the original device. + */ + oldstr = replaced.GUIDString(); + syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing " + "broken spare %s instead", VdevGUIDString().c_str(), + path, oldstr.c_str()); + } + + /* + * Build a root vdev/leaf vdev configuration suitable for + * zpool_vdev_attach. Only enough data for the kernel to find + * the device (i.e. type and disk device node path) are needed. + */ + nvroot = NULL; + newvd = NULL; + + if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0 + || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { + syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate " + "configuration data.", poolname, oldstr.c_str()); + if (nvroot != NULL) + nvlist_free(nvroot); + return (false); + } + if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0 + || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 + || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 + || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &newvd, 1) != 0) { + syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize " + "configuration data.", poolname, oldstr.c_str()); + nvlist_free(newvd); + nvlist_free(nvroot); + return (true); + } + + /* Data was copied when added to the root vdev. 
*/ + nvlist_free(newvd); + + retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot, + /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0); + if (retval) + syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n", + poolname, oldstr.c_str(), path); + else + syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n", + poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle), + libzfs_error_description(g_zfsHandle)); + nvlist_free(nvroot); + + return (retval); +} + +/* Does the argument event refer to a checksum error? */ +static bool +IsChecksumEvent(const Event* const event) +{ + return ("ereport.fs.zfs.checksum" == event->Value("type")); +} + +/* Does the argument event refer to an IO error? */ +static bool +IsIOEvent(const Event* const event) +{ + return ("ereport.fs.zfs.io" == event->Value("type")); +} + +bool +CaseFile::ShouldDegrade() const +{ + return (std::count_if(m_events.begin(), m_events.end(), + IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT); +} + +bool +CaseFile::ShouldFault() const +{ + return (std::count_if(m_events.begin(), m_events.end(), + IsIOEvent) > ZFS_DEGRADE_IO_COUNT); +} + +nvlist_t * +CaseFile::CaseVdev(zpool_handle_t *zhp) const +{ + return (VdevIterator(zhp).Find(VdevGUID())); +} diff --git a/cmd/zfsd/case_file.h b/cmd/zfsd/case_file.h new file mode 100644 index 000000000000..d7475d331a76 --- /dev/null +++ b/cmd/zfsd/case_file.h @@ -0,0 +1,452 @@ +/*- + * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. 
Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + * + * $FreeBSD$ + */ + +/** + * \file case_file.h + * + * CaseFile objects aggregate vdev faults that may require ZFSD action + * in order to maintain the health of a ZFS pool. + * + * Header requirements: + * + * #include + * + * #include "callout.h" + * #include "zfsd_event.h" + */ +#ifndef _CASE_FILE_H_ +#define _CASE_FILE_H_ + +/*=========================== Forward Declarations ===========================*/ +class CaseFile; +class Vdev; + +/*============================= Class Definitions ============================*/ +/*------------------------------- CaseFileList -------------------------------*/ +/** + * CaseFileList is a specialization of the standard list STL container. 
+ */ +typedef std::list< CaseFile *> CaseFileList; + +/*--------------------------------- CaseFile ---------------------------------*/ +/** + * A CaseFile object is instantiated anytime a vdev for an active pool + * experiences an I/O error, is faulted by ZFS, or is determined to be + * missing/removed. + * + * A vdev may have at most one CaseFile. + * + * CaseFiles are retired when a vdev leaves an active pool configuration + * or an action is taken to resolve the issues recorded in the CaseFile. + * + * Logging a case against a vdev does not imply that an immediate action + * to resolve a fault is required or even desired. For example, a CaseFile + * must accumulate a number of I/O errors in order to flag a device as + * degraded. + * + * Vdev I/O errors are not recorded in ZFS label information. For this + * reason, CaseFile%%s with accumulated I/O error events are serialized + * to the file system so that they survive across boots. Currently all + * other fault types can be reconstructed from ZFS label information, so + * CaseFile%%s for missing, faulted, or degraded members are just recreated + * at ZFSD startup instead of being deserialized from the file system. + */ +class CaseFile +{ +public: + /** + * \brief Find a CaseFile object by a vdev's pool/vdev GUID tuple. + * + * \param poolGUID Pool GUID for the vdev of the CaseFile to find. + * If InvalidGuid, then only match the vdev GUID + * instead of both pool and vdev GUIDs. + * \param vdevGUID Vdev GUID for the vdev of the CaseFile to find. + * + * \return If found, a pointer to a valid CaseFile object. + * Otherwise NULL. + */ + static CaseFile *Find(DevdCtl::Guid poolGUID, DevdCtl::Guid vdevGUID); + + /** + * \brief Find multiple CaseFile objects by a vdev's pool/vdev + * GUID tuple (special case for spare vdevs) + * + * \param poolGUID Pool GUID for the vdev of the CaseFile to find. + * If InvalidGuid, then only match the vdev GUID + * instead of both pool and vdev GUIDs. 
+ * \param vdevGUID Vdev GUID for the vdev of the CaseFile to find. + * \param caseList List of cases associated with the vdev. + */ + static void Find(DevdCtl::Guid poolGUID, DevdCtl::Guid vdevGUID, + CaseFileList &caseList); + + /** + * \brief Find a CaseFile object by a vdev's current/last known + * physical path. + * + * \param physPath Physical path of the vdev of the CaseFile to find. + * + * \return If found, a pointer to a valid CaseFile object. + * Otherwise NULL. + */ + static CaseFile *Find(const string &physPath); + + /** + * \brief ReEvaluate all open cases whose pool guid matches the argument + * + * \param poolGUID Only reevaluate cases for this pool + * \param event Try to consume this event with the casefile + */ + static void ReEvaluateByGuid(DevdCtl::Guid poolGUID, + const ZfsEvent &event); + + /** + * \brief Create or return an existing active CaseFile for the + * specified vdev. + * + * \param vdev The vdev object for which to find/create a CaseFile. + * + * \return A reference to a valid CaseFile object. + */ + static CaseFile &Create(Vdev &vdev); + + /** + * \brief Deserialize all serialized CaseFile objects found in + * the file system. + */ + static void DeSerialize(); + + /** + * \brief returns true if there are no CaseFiles + */ + static bool Empty(); + + /** + * \brief Emit syslog data on all active CaseFile%%s in the system. + */ + static void LogAll(); + + /** + * \brief Destroy the in-core cache of CaseFile data. + * + * This routine does not disturb the on disk, serialized, CaseFile + * data. + */ + static void PurgeAll(); + + DevdCtl::Guid PoolGUID() const; + DevdCtl::Guid VdevGUID() const; + vdev_state VdevState() const; + const string &PoolGUIDString() const; + const string &VdevGUIDString() const; + const string &PhysicalPath() const; + + /** + * \brief Attempt to resolve this CaseFile using the disk + * resource at the given device/physical path/vdev object + * tuple. + * + * \param devPath The devfs path for the disk resource. 
+ * \param physPath The physical path information reported by + * the disk resource. + * \param vdev If the disk contains ZFS label information, + * a pointer to the disk label's vdev object + * data. Otherwise NULL. + * + * \return True if this event was consumed by this CaseFile. + */ + bool ReEvaluate(const string &devPath, const string &physPath, + Vdev *vdev); + + /** + * \brief Update this CaseFile in light of the provided ZfsEvent. + * + * Must be virtual so it can be overridden in the unit tests + * + * \param event The ZfsEvent to evaluate. + * + * \return True if this event was consumed by this CaseFile. + */ + virtual bool ReEvaluate(const ZfsEvent &event); + + /** + * \brief Register an itimer callout for the given event, if necessary + */ + virtual void RegisterCallout(const DevdCtl::Event &event); + + /** + * \brief Close a case if it is no longer relevant. + * + * This method deals with cases tracking soft errors. Soft errors + * will be discarded should a remove event occur within a short period + * of the soft errors being reported. We also discard the events + * if the vdev is marked degraded or failed. + * + * \return True if the case is closed. False otherwise. + */ + bool CloseIfSolved(); + + /** + * \brief Emit data about this CaseFile via syslog(3). + */ + void Log(); + + /** + * \brief Whether we should degrade this vdev + */ + bool ShouldDegrade() const; + + /** + * \brief Whether we should fault this vdev + */ + bool ShouldFault() const; + + /** + * \brief If this vdev is spare + */ + int IsSpare(); + +protected: + enum { + /** + * The number of soft errors on a vdev required + * to transition a vdev from healthy to degraded + * status. + */ + ZFS_DEGRADE_IO_COUNT = 50 + }; + + static CalloutFunc_t OnGracePeriodEnded; + + /** + * \brief scandir(3) filter function used to find files containing + * serialized CaseFile data. + * + * \param dirEntry Directory entry for the file to filter. 
+ * + * \return Non-zero for a file to include in the selection, + * otherwise 0. + */ + static int DeSerializeSelector(const struct dirent *dirEntry); + + /** + * \brief Given the name of a file containing serialized events from a + * CaseFile object, create/update an in-core CaseFile object + * representing the serialized data. + * + * \param fileName The name of a file containing serialized events + * from a CaseFile object. + */ + static void DeSerializeFile(const char *fileName); + + /** Constructor. */ + CaseFile(const Vdev &vdev); + + /** + * Destructor. + * Must be virtual so it can be subclassed in the unit tests + */ + virtual ~CaseFile(); + + /** + * \brief Reload state for the vdev associated with this CaseFile. + * + * \return True if the refresh was successful. False if the system + * has no record of the pool or vdev for this CaseFile. + */ + virtual bool RefreshVdevState(); + + /** + * \brief Free all events in the m_events list. + */ + void PurgeEvents(); + + /** + * \brief Free all events in the m_tentativeEvents list. + */ + void PurgeTentativeEvents(); + + /** + * \brief Commit to file system storage. + */ + void Serialize(); + + /** + * \brief Retrieve event data from a serialization stream. + * + * \param caseStream The serialization stream to parse. + */ + void DeSerialize(std::ifstream &caseStream); + + /** + * \brief Serializes the supplied event list and writes it to fd + * + * \param prefix If not NULL, this prefix will be prepended to + * every event in the file. + */ + void SerializeEvList(const DevdCtl::EventList events, int fd, + const char* prefix=NULL) const; + + /** + * \brief Unconditionally close a CaseFile. + */ + virtual void Close(); + + /** + * \brief Callout callback invoked when the remove timer grace + * period expires. + * + * If no remove events are received prior to the grace period + * firing, then any tentative events are promoted and counted + * against the health of the vdev. 
+ */ + void OnGracePeriodEnded(); + + /** + * \brief Attempt to activate a spare on this case's pool. + * + * Call this whenever a pool becomes degraded. It will look for any + * spare devices and activate one to replace the casefile's vdev. It + * will _not_ close the casefile; that should only happen when the + * missing drive is replaced or the user promotes the spare. + * + * \return True if a spare was activated + */ + bool ActivateSpare(); + + /** + * \brief replace a pool's vdev with another + * + * \param vdev_type The type of the new vdev. Usually either + * VDEV_TYPE_DISK or VDEV_TYPE_FILE + * \param path The file system path to the new vdev + * \param isspare Whether the new vdev is a spare + * + * \return true iff the replacement was successful + */ + bool Replace(const char* vdev_type, const char* path, bool isspare); + + /** + * \brief Which vdev, if any, is replacing ours. + * + * \param zhp Pool handle state from the caller context + * + * \return the vdev that is currently replacing ours, + * or NonexistentVdev if there isn't one. + */ + Vdev BeingReplacedBy(zpool_handle_t *zhp); + + /** + * \brief All CaseFiles being tracked by ZFSD. + */ + static CaseFileList s_activeCases; + + /** + * \brief The file system path to serialized CaseFile data. + */ + static const string s_caseFilePath; + + /** + * \brief The time ZFSD waits before promoting a tentative event + * into a permanent event. + */ + static const timeval s_removeGracePeriod; + + /** + * \brief A list of soft error events counted against the health of + * a vdev. + */ + DevdCtl::EventList m_events; + + /** + * \brief A list of soft error events waiting for a grace period + * expiration before being counted against the health of + * a vdev. 
+ */ + DevdCtl::EventList m_tentativeEvents; + + DevdCtl::Guid m_poolGUID; + DevdCtl::Guid m_vdevGUID; + vdev_state m_vdevState; + string m_poolGUIDString; + string m_vdevGUIDString; + string m_vdevPhysPath; + int m_is_spare; + + /** + * \brief Callout activated when the remove grace period expires + */ + Callout m_tentativeTimer; + +private: + nvlist_t *CaseVdev(zpool_handle_t *zhp) const; +}; + +inline DevdCtl::Guid +CaseFile::PoolGUID() const +{ + return (m_poolGUID); +} + +inline DevdCtl::Guid +CaseFile::VdevGUID() const +{ + return (m_vdevGUID); +} + +inline vdev_state +CaseFile::VdevState() const +{ + return (m_vdevState); +} + +inline const string & +CaseFile::PoolGUIDString() const +{ + return (m_poolGUIDString); +} + +inline const string & +CaseFile::VdevGUIDString() const +{ + return (m_vdevGUIDString); +} + +inline const string & +CaseFile::PhysicalPath() const +{ + return (m_vdevPhysPath); +} + +#endif /* _CASE_FILE_H_ */ diff --git a/cmd/zfsd/vdev.cc b/cmd/zfsd/vdev.cc new file mode 100644 index 000000000000..a0e089a5bdf5 --- /dev/null +++ b/cmd/zfsd/vdev.cc @@ -0,0 +1,358 @@ +/*- + * Copyright (c) 2011, 2012, 2013, 2014 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + * + * $FreeBSD$ + */ + +/** + * \file vdev.cc + * + * Implementation of the Vdev class. + */ +#include +#include +#include +#include + +#include +/* + * Undefine flush, defined by cpufunc.h on sparc64, because it conflicts with + * C++ flush methods + */ +#undef flush + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "vdev.h" +#include "vdev_iterator.h" +#include "zfsd.h" +#include "zfsd_exception.h" +#include "zpool_list.h" + +__FBSDID("$FreeBSD$"); +/*============================ Namespace Control =============================*/ +using std::string; +using std::stringstream; + +//- Special objects ----------------------------------------------------------- +Vdev NonexistentVdev; + +//- Vdev Inline Public Methods ------------------------------------------------ +/*=========================== Class Implementations ==========================*/ +/*----------------------------------- Vdev -----------------------------------*/ + +/* Special constructor for NonexistentVdev. 
*/ +Vdev::Vdev() + : m_poolConfig(NULL), + m_config(NULL) +{} + +bool +Vdev::VdevLookupPoolGuid() +{ + uint64_t guid; + if (nvlist_lookup_uint64(m_poolConfig, ZPOOL_CONFIG_POOL_GUID, &guid)) + return (false); + m_poolGUID = guid; + return (true); +} + +void +Vdev::VdevLookupGuid() +{ + uint64_t guid; + if (nvlist_lookup_uint64(m_config, ZPOOL_CONFIG_GUID, &guid) != 0) + throw ZfsdException("Unable to extract vdev GUID " + "from vdev config data."); + m_vdevGUID = guid; +} + +Vdev::Vdev(zpool_handle_t *pool, nvlist_t *config) + : m_poolConfig(zpool_get_config(pool, NULL)), + m_config(config) +{ + if (!VdevLookupPoolGuid()) + throw ZfsdException("Can't extract pool GUID from handle."); + VdevLookupGuid(); +} + +Vdev::Vdev(nvlist_t *poolConfig, nvlist_t *config) + : m_poolConfig(poolConfig), + m_config(config) +{ + if (!VdevLookupPoolGuid()) + throw ZfsdException("Can't extract pool GUID from config."); + VdevLookupGuid(); +} + +Vdev::Vdev(nvlist_t *labelConfig) + : m_poolConfig(labelConfig), + m_config(labelConfig) +{ + /* + * Spares do not have a Pool GUID. Tolerate its absence. + * Code accessing this Vdev in a context where the Pool GUID is + * required will find it invalid (as it is upon Vdev construction) + * and act accordingly. + */ + (void) VdevLookupPoolGuid(); + VdevLookupGuid(); + + try { + m_config = VdevIterator(labelConfig).Find(m_vdevGUID); + } catch (const ZfsdException &exp) { + /* + * When reading a spare's label, it is normal not to find + * a list of vdevs + */ + m_config = NULL; + } +} + +bool +Vdev::IsSpare() const +{ + uint64_t is_spare(0); + + if (m_config == NULL) + return (false); + + (void)nvlist_lookup_uint64(m_config, ZPOOL_CONFIG_IS_SPARE, &is_spare); + return (bool(is_spare)); +} + +vdev_state +Vdev::State() const +{ + uint64_t *nvlist_array; + vdev_stat_t *vs; + uint_t vsc; + + if (m_config == NULL) { + /* + * If we couldn't find the list of vdevs, that normally means + * that this is an available hotspare. 
In that case, we will + * presume it to be healthy. Even if this spare had formerly + * been in use, been degraded, and been replaced, the act of + * replacement wipes the degraded bit from the label. So we + * have no choice but to presume that it is healthy. + */ + return (VDEV_STATE_HEALTHY); + } + + if (nvlist_lookup_uint64_array(m_config, ZPOOL_CONFIG_VDEV_STATS, + &nvlist_array, &vsc) == 0) { + vs = reinterpret_cast(nvlist_array); + return (static_cast(vs->vs_state)); + } + + /* + * Stats are not available. This vdev was created from a label. + * Synthesize a state based on available data. + */ + uint64_t faulted(0); + uint64_t degraded(0); + (void)nvlist_lookup_uint64(m_config, ZPOOL_CONFIG_FAULTED, &faulted); + (void)nvlist_lookup_uint64(m_config, ZPOOL_CONFIG_DEGRADED, °raded); + if (faulted) + return (VDEV_STATE_FAULTED); + if (degraded) + return (VDEV_STATE_DEGRADED); + return (VDEV_STATE_HEALTHY); +} + +std::list +Vdev::Children() +{ + nvlist_t **vdevChildren; + int result; + u_int numChildren; + std::list children; + + if (m_poolConfig == NULL || m_config == NULL) + return (children); + + result = nvlist_lookup_nvlist_array(m_config, + ZPOOL_CONFIG_CHILDREN, &vdevChildren, &numChildren); + if (result != 0) + return (children); + + for (u_int c = 0;c < numChildren; c++) + children.push_back(Vdev(m_poolConfig, vdevChildren[c])); + + return (children); +} + +Vdev +Vdev::RootVdev() +{ + nvlist_t *rootVdev; + + if (m_poolConfig == NULL) + return (NonexistentVdev); + + if (nvlist_lookup_nvlist(m_poolConfig, ZPOOL_CONFIG_VDEV_TREE, + &rootVdev) != 0) + return (NonexistentVdev); + return (Vdev(m_poolConfig, rootVdev)); +} + +/* + * Find our parent. This requires doing a traversal of the config; we can't + * cache it as leaf vdevs may change their pool config location (spare, + * replacing, mirror, etc). 
+ */ +Vdev +Vdev::Parent() +{ + std::list<Vdev> to_examine; + std::list<Vdev> children; + std::list<Vdev>::iterator children_it; + + to_examine.push_back(RootVdev()); + for (;;) { + if (to_examine.empty()) + return (NonexistentVdev); + Vdev vd = to_examine.front(); + if (vd.DoesNotExist()) + return (NonexistentVdev); + to_examine.pop_front(); + children = vd.Children(); + children_it = children.begin(); + for (;children_it != children.end(); children_it++) { + Vdev child = *children_it; + + if (child.GUID() == GUID()) + return (vd); + to_examine.push_front(child); + } + } +} + +bool +Vdev::IsAvailableSpare() const +{ + /* If we have a pool guid, we cannot be an available spare. */ + if (PoolGUID()) + return (false); + + return (true); +} + +bool +Vdev::IsSpare() +{ + uint64_t spare; + if (nvlist_lookup_uint64(m_config, ZPOOL_CONFIG_IS_SPARE, &spare) != 0) + return (false); + return (spare != 0); +} + +bool +Vdev::IsActiveSpare() const +{ + vdev_stat_t *vs = NULL; + uint_t c; + + if (m_poolConfig == NULL) + return (false); + + (void) nvlist_lookup_uint64_array(m_config, ZPOOL_CONFIG_VDEV_STATS, + reinterpret_cast<uint64_t **>(&vs), &c); + return (vs != NULL && vs->vs_aux == VDEV_AUX_SPARED); +} + +bool +Vdev::IsResilvering() const +{ + pool_scan_stat_t *ps = NULL; + uint_t c; + + if (State() != VDEV_STATE_HEALTHY) + return (false); + + (void) nvlist_lookup_uint64_array(m_config, ZPOOL_CONFIG_SCAN_STATS, + reinterpret_cast<uint64_t **>(&ps), &c); + return (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER); +} + +string +Vdev::GUIDString() const +{ + stringstream vdevGUIDString; + + vdevGUIDString << GUID(); + return (vdevGUIDString.str()); +} + +string +Vdev::Name(zpool_handle_t *zhp, bool verbose) const +{ + /* zpool_vdev_name() returns an allocated string that we must free. */ + char *rawName = zpool_vdev_name(g_zfsHandle, zhp, m_config, + verbose ? 
B_TRUE : B_FALSE); + string name(rawName != NULL ? rawName : ""); + free(rawName); + return (name); +} + +string +Vdev::Path() const +{ + const char *path(NULL); + + if ((m_config != NULL) + && (nvlist_lookup_string(m_config, ZPOOL_CONFIG_PATH, &path) == 0)) + return (path); + + return (""); +} + +string +Vdev::PhysicalPath() const +{ + const char *path(NULL); + + if ((m_config != NULL) && (nvlist_lookup_string(m_config, + ZPOOL_CONFIG_PHYS_PATH, &path) == 0)) + return (path); + + return (""); +} diff --git a/cmd/zfsd/vdev.h b/cmd/zfsd/vdev.h new file mode 100644 index 000000000000..322efc8f4e53 --- /dev/null +++ b/cmd/zfsd/vdev.h @@ -0,0 +1,188 @@ +/*- + * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + * + * $FreeBSD$ + */ + +/** + * \file vdev.h + * + * Definition of the Vdev class. + * + * Header requirements: + * + * #include + * #include + * + * #include + */ +#ifndef _VDEV_H_ +#define _VDEV_H_ + +/*=========================== Forward Declarations ===========================*/ +struct zpool_handle; +typedef struct zpool_handle zpool_handle_t; + +struct nvlist; +typedef struct nvlist nvlist_t; + +/*============================= Class Definitions ============================*/ +/*----------------------------------- Vdev -----------------------------------*/ +/** + * \brief Wrapper class for a vdev's name/value configuration list + * simplifying access to commonly used vdev attributes. + */ +class Vdev +{ +public: + /** + * \brief Instantiate a vdev object for a vdev that is a member + * of an imported pool. + * + * \param pool The pool object containing the vdev with + * configuration data provided in vdevConfig. + * \param vdevConfig Vdev configuration data. + * + * This method should be used whenever dealing with vdev's + * enumerated via the ZpoolList class. The in-core configuration + * data for a vdev does not contain all of the items found in + * the on-disk label. This requires the vdev class to augment + * the data in vdevConfig with data found in the pool object. 
+ */ + Vdev(zpool_handle_t *pool, nvlist_t *vdevConfig); + + /** + * \brief Instantiate a vdev object for a vdev that is a member + * of a pool configuration. + * + * \param poolConfig The pool configuration containing the vdev + * configuration data provided in vdevConfig. + * \param vdevConfig Vdev configuration data. + * + * This method should be used whenever dealing with vdev's + * enumerated via the ZpoolList class. The in-core configuration + * data for a vdev does not contain all of the items found in + * the on-disk label. This requires the vdev class to augment + * the data in vdevConfig with data found in the pool object. + */ + Vdev(nvlist_t *poolConfig, nvlist_t *vdevConfig); + + /** + * \brief Instantiate a vdev object from a ZFS label stored on + * the device. + * + * \param vdevConfig The name/value list retrieved by reading + * the label information on a leaf vdev. + */ + Vdev(nvlist_t *vdevConfig); + + /** + * \brief Default constructor, used to build NonexistentVdev. + */ + Vdev(); + + /** + * \brief No-op virtual destructor, since this class has virtual + * functions. + */ + virtual ~Vdev(); + bool DoesNotExist() const; + + /** + * \brief Return a list of the vdev's children. 
+ */ + std::list<Vdev> Children(); + + virtual DevdCtl::Guid GUID() const; + bool IsSpare() const; + virtual DevdCtl::Guid PoolGUID() const; + virtual vdev_state State() const; + std::string Path() const; + virtual std::string PhysicalPath() const; + std::string GUIDString() const; + nvlist_t *PoolConfig() const; + nvlist_t *Config() const; + Vdev Parent(); + Vdev RootVdev(); + std::string Name(zpool_handle_t *, bool verbose) const; + bool IsSpare(); + bool IsAvailableSpare() const; + bool IsActiveSpare() const; + bool IsResilvering() const; + +private: + void VdevLookupGuid(); + bool VdevLookupPoolGuid(); + DevdCtl::Guid m_poolGUID; + DevdCtl::Guid m_vdevGUID; + nvlist_t *m_poolConfig; + nvlist_t *m_config; +}; + +//- Special objects ----------------------------------------------------------- +extern Vdev NonexistentVdev; + +//- Vdev Inline Public Methods ------------------------------------------------ +inline Vdev::~Vdev() +{ +} + +inline DevdCtl::Guid +Vdev::PoolGUID() const +{ + return (m_poolGUID); +} + +inline DevdCtl::Guid +Vdev::GUID() const +{ + return (m_vdevGUID); +} + +inline nvlist_t * +Vdev::PoolConfig() const +{ + return (m_poolConfig); +} + +inline nvlist_t * +Vdev::Config() const +{ + return (m_config); +} + +inline bool +Vdev::DoesNotExist() const +{ + return (m_config == NULL); +} + +#endif /* _VDEV_H_ */ diff --git a/cmd/zfsd/vdev_iterator.cc b/cmd/zfsd/vdev_iterator.cc new file mode 100644 index 000000000000..c23399ed68a8 --- /dev/null +++ b/cmd/zfsd/vdev_iterator.cc @@ -0,0 +1,172 @@ +/*- + * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. 
Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + * + * $FreeBSD$ + */ + +/** + * \file vdev_iterator.cc + * + * Implementation of the VdevIterator class. 
+ */ +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include +#include + +#include "vdev.h" +#include "vdev_iterator.h" +#include "zfsd_exception.h" + +/*============================ Namespace Control =============================*/ +using DevdCtl::Guid; + +/*=========================== Class Implementations ==========================*/ +/*------------------------------- VdevIterator -------------------------------*/ +VdevIterator::VdevIterator(zpool_handle_t *pool) + : m_poolConfig(zpool_get_config(pool, NULL)) +{ + Reset(); +} + +VdevIterator::VdevIterator(nvlist_t *poolConfig) + : m_poolConfig(poolConfig) +{ + Reset(); +} + +void +VdevIterator::Reset() +{ + nvlist_t *rootVdev; + nvlist **cache_child; + nvlist **spare_child; + int result; + uint_t cache_children; + uint_t spare_children; + + result = nvlist_lookup_nvlist(m_poolConfig, + ZPOOL_CONFIG_VDEV_TREE, + &rootVdev); + if (result != 0) + throw ZfsdException(m_poolConfig, "Unable to extract " + "ZPOOL_CONFIG_VDEV_TREE from pool."); + m_vdevQueue.assign(1, rootVdev); + result = nvlist_lookup_nvlist_array(rootVdev, + ZPOOL_CONFIG_L2CACHE, + &cache_child, + &cache_children); + if (result == 0) + for (uint_t c = 0; c < cache_children; c++) + m_vdevQueue.push_back(cache_child[c]); + result = nvlist_lookup_nvlist_array(rootVdev, + ZPOOL_CONFIG_SPARES, + &spare_child, + &spare_children); + if (result == 0) + for (uint_t c = 0; c < spare_children; c++) + m_vdevQueue.push_back(spare_child[c]); +} + +nvlist_t * +VdevIterator::Next() +{ + nvlist_t *vdevConfig; + + for (;;) { + nvlist_t **vdevChildren; + int result; + u_int numChildren; + + /* An empty child array can drain the queue mid-walk. */ + if (m_vdevQueue.empty()) + return (NULL); + vdevConfig = m_vdevQueue.front(); + m_vdevQueue.pop_front(); + + /* Expand non-leaf vdevs. 
*/ + result = nvlist_lookup_nvlist_array(vdevConfig, + ZPOOL_CONFIG_CHILDREN, + &vdevChildren, &numChildren); + if (result != 0) { + /* leaf vdev */ + break; + } + + /* + * Insert children at the head of the queue to effect a + * depth first traversal of the tree. + */ + m_vdevQueue.insert(m_vdevQueue.begin(), vdevChildren, + vdevChildren + numChildren); + } + + return (vdevConfig); +} + +void +VdevIterator::Each(VdevCallback_t *callBack, void *callBackArg) +{ + nvlist_t *vdevConfig; + + Reset(); + while ((vdevConfig = Next()) != NULL) { + Vdev vdev(m_poolConfig, vdevConfig); + + if (callBack(vdev, callBackArg)) + break; + } +} + +nvlist_t * +VdevIterator::Find(Guid vdevGUID) +{ + nvlist_t *vdevConfig; + + Reset(); + while ((vdevConfig = Next()) != NULL) { + Vdev vdev(m_poolConfig, vdevConfig); + + if (vdev.GUID() == vdevGUID) + return (vdevConfig); + } + return (NULL); +} diff --git a/cmd/zfsd/vdev_iterator.h b/cmd/zfsd/vdev_iterator.h new file mode 100644 index 000000000000..435582ec1f84 --- /dev/null +++ b/cmd/zfsd/vdev_iterator.h @@ -0,0 +1,123 @@ +/*- + * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + * + * $FreeBSD$ + */ + +/** + * \file vdev_iterator.h + * + * VdevIterator class definition. + * + * Header requirements: + * + * #include + */ +#ifndef _VDEV_ITERATOR_H_ +#define _VDEV_ITERATOR_H_ + +/*=========================== Forward Declarations ===========================*/ +struct zpool_handle; +typedef struct zpool_handle zpool_handle_t; + +struct nvlist; +typedef struct nvlist nvlist_t; + +class Vdev; + +/*============================= Class Definitions ============================*/ +/*------------------------------- VdevIterator -------------------------------*/ +typedef bool VdevCallback_t(Vdev &vdev, void *cbArg); + +/** + * \brief VdevIterator provides mechanisms for traversing and searching + * the leaf vdevs contained in a ZFS pool configuration. + */ +class VdevIterator +{ +public: + /** + * \brief Instantiate a VdevIterator for the given ZFS pool. + * + * \param pool The ZFS pool to traverse/search. + */ + VdevIterator(zpool_handle_t *pool); + + /** + * \brief Instantiate a VdevIterator for the given ZFS pool. + * + * \param poolConfig The configuration data for the ZFS pool + * to traverse/search. 
+ */ + VdevIterator(nvlist_t *poolConfig); + + /** + * \brief Reset this iterator's cursor so that Next() will + * report the first member of the pool. + */ + void Reset(); + + /** + * \brief Report the leaf vdev at this iterator's cursor and increment + * the cursor to the next leaf pool member. + */ + nvlist_t *Next(); + + /** + * \brief Traverse the entire pool configuration starting at its + * first member, returning the configuration of the vdev + * with the given vdev GUID if found. + * + * \param vdevGUID The vdev GUID of the vdev object to find. + * + * \return The configuration nvlist of the matching vdev if found. + * Otherwise NULL. + * + * Upon return, the VdevIterator's cursor points to the vdev just + * past the returned vdev or end() if no matching vdev is found. + */ + nvlist_t *Find(DevdCtl::Guid vdevGUID); + + /** + * \brief Perform the specified operation on each leaf member of + * a pool's vdev membership. + * + * \param cb Callback function to execute for each member. + * \param cbArg Argument to pass to cb. + */ + void Each(VdevCallback_t *cb, void *cbArg); + +private: + nvlist_t *m_poolConfig; + std::list<nvlist_t *> m_vdevQueue; +}; + +#endif /* _VDEV_ITERATOR_H_ */ diff --git a/cmd/zfsd/zfsd.8 b/cmd/zfsd/zfsd.8 new file mode 100644 index 000000000000..ce7550b891a3 --- /dev/null +++ b/cmd/zfsd/zfsd.8 @@ -0,0 +1,154 @@ +.\"- +.\" Copyright (c) 2016 Allan Jude +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd April 18, 2020 +.Dt ZFSD 8 +.Os +.Sh NAME +.Nm zfsd +.Nd ZFS fault management daemon +.Sh SYNOPSIS +.Nm +.Op Fl d +.Sh DESCRIPTION +.Nm +attempts to resolve ZFS faults that the kernel can't resolve by itself. +It listens to +.Xr devctl 4 +events, which are how the kernel notifies userland of events such as I/O +errors and disk removals. +.Nm +attempts to resolve these faults by activating or deactivating hot spares +and onlining offline vdevs. +.Pp +The following options are available: +.Bl -tag -width indent +.It Fl d +Run in the foreground instead of daemonizing. +.El +.Pp +System administrators never interact with +.Nm +directly. +Instead, they control its behavior indirectly through zpool configuration. +There are two ways to influence +.Nm : +assigning hotspares and setting pool properties. +Currently, only the +.Em autoreplace +property has any effect. +See +.Xr zpool 8 +for details. +.Pp +.Nm +will attempt to resolve the following types of fault: +.Bl -tag -width a +.It device removal +When a leaf vdev disappears, +.Nm +will activate any available hotspare. 
+.It device arrival +When a new GEOM device appears, +.Nm +will attempt to read its ZFS label, if any. +If it matches a previously removed vdev on an active pool, +.Nm +will online it. +Once resilvering completes, any active hotspare will detach automatically. +.Pp +If the new device has no ZFS label but its physical path matches the +physical path of a previously removed vdev on an active pool, and that +pool has the autoreplace property set, then +.Nm +will replace the missing vdev with the newly arrived device. +Once resilvering completes, any active hotspare will detach automatically. +.It vdev degrade or fault events +If a vdev becomes degraded or faulted, +.Nm +will activate any available hotspare. +.It I/O errors +If a leaf vdev generates more than 50 I/O errors in a 60 second period, then +.Nm +will mark that vdev as +.Em FAULTED . +ZFS will no longer issue any I/Os to it. +.Nm +will activate a hotspare if one is available. +.It Checksum errors +If a leaf vdev generates more than 50 checksum errors in a 60 second +period, then +.Nm +will mark that vdev as +.Em DEGRADED . +ZFS will still use it, but zfsd will activate a spare anyway. +.It Spare addition +If the system administrator adds a hotspare to a pool that is already degraded, +.Nm +will activate the spare. +.It Resilver complete +.Nm +will detach any hotspare once a permanent replacement finishes resilvering. +.It Physical path change +If the physical path of an existing disk changes, +.Nm +will attempt to replace any missing disk with the same physical path, +if its pool's autoreplace property is set. +.El +.Pp +.Nm +will log interesting events and its actions to syslog with facility +.Em daemon +and identity +.Op zfsd . +.\" +.Sh FILES +.Bl -tag -width a -compact +.It Pa /var/db/zfsd/cases +When +.Nm +exits, it serializes any unresolved casefiles here, +then reads them back in when next it starts up. +.El +.Sh SEE ALSO +.Xr devctl 4 , +.Xr zpool 8 +.Sh HISTORY +.Nm +first appeared in +.Fx 11.0 . 
+.Sh AUTHORS +.Nm +was originally written by +.An Justin Gibbs Aq Mt gibbs@FreeBSD.org +and +.An Alan Somers Aq Mt asomers@FreeBSD.org +.Sh TODO +In the future, +.Nm +should be able to resume a pool that became suspended due to device +removals, if enough missing devices have returned. diff --git a/cmd/zfsd/zfsd.cc b/cmd/zfsd/zfsd.cc new file mode 100644 index 000000000000..29c6b1ae22e2 --- /dev/null +++ b/cmd/zfsd/zfsd.cc @@ -0,0 +1,449 @@ +/*- + * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. 
Gibbs (Spectra Logic Corporation) + */ + +/** + * \file zfsd.cc + * + * The ZFS daemon consumes kernel devdctl(4) event data via devd(8)'s + * unix domain socket in order to react to system changes that impact + * the function of ZFS storage pools. The goal of this daemon is to + * provide similar functionality to the Solaris ZFS Diagnostic Engine + * (zfs-diagnosis), the Solaris ZFS fault handler (zfs-retire), and + * the Solaris ZFS vdev insertion agent (zfs-mod sysevent handler). + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "callout.h" +#include "vdev_iterator.h" +#include "zfsd_event.h" +#include "case_file.h" +#include "vdev.h" +#include "vdev_iterator.h" +#include "zfsd.h" +#include "zfsd_exception.h" +#include "zpool_list.h" + +__FBSDID("$FreeBSD$"); + +/*================================== Macros ==================================*/ +#define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x)) + +/*============================ Namespace Control =============================*/ +using DevdCtl::Event; +using DevdCtl::EventFactory; +using DevdCtl::EventList; + +/*================================ Global Data ===============================*/ +int g_debug = 0; +libzfs_handle_t *g_zfsHandle; + +/*--------------------------------- ZfsDaemon --------------------------------*/ +//- ZfsDaemon Static Private Data ---------------------------------------------- +ZfsDaemon *ZfsDaemon::s_theZfsDaemon; +bool ZfsDaemon::s_logCaseFiles; +bool ZfsDaemon::s_terminateEventLoop; +char ZfsDaemon::s_pidFilePath[] = "/var/run/zfsd.pid"; +pidfh *ZfsDaemon::s_pidFH; +int ZfsDaemon::s_signalPipeFD[2]; +bool ZfsDaemon::s_systemRescanRequested(false); +EventFactory::Record ZfsDaemon::s_registryEntries[] = +{ + { Event::NOTIFY, "GEOM", &GeomEvent::Builder }, + { Event::NOTIFY, "ZFS", &ZfsEvent::Builder } +}; + +//- ZfsDaemon Static 
Public Methods -------------------------------------------- +ZfsDaemon & +ZfsDaemon::Get() +{ + return (*s_theZfsDaemon); /* set by the constructor */ +} + +void +ZfsDaemon::WakeEventLoop() +{ + write(s_signalPipeFD[1], "+", 1); /* non-blocking; result ignored */ +} + +void +ZfsDaemon::RequestSystemRescan() +{ + s_systemRescanRequested = true; + ZfsDaemon::WakeEventLoop(); +} + +void +ZfsDaemon::Run() +{ + ZfsDaemon daemon; + + while (s_terminateEventLoop == false) { + + try { + daemon.DisconnectFromDevd(); + + if (daemon.ConnectToDevd() == false) { + sleep(30); /* back off, then retry */ + continue; + } + + daemon.DetectMissedEvents(); + + daemon.EventLoop(); + + } catch (const DevdCtl::Exception &exp) { + exp.Log(); /* then loop around and reconnect */ + } + } + + daemon.DisconnectFromDevd(); +} + +//- ZfsDaemon Private Methods -------------------------------------------------- +ZfsDaemon::ZfsDaemon() + : Consumer(/*defBuilder*/NULL, s_registryEntries, + NUM_ELEMENTS(s_registryEntries)) +{ + if (s_theZfsDaemon != NULL) + errx(1, "Multiple ZfsDaemon instances created. Exiting"); + + s_theZfsDaemon = this; + + if (pipe(s_signalPipeFD) != 0) + errx(1, "Unable to allocate signal pipe. Exiting"); + + if (fcntl(s_signalPipeFD[0], F_SETFL, O_NONBLOCK) == -1) + errx(1, "Unable to set pipe as non-blocking. Exiting"); + + if (fcntl(s_signalPipeFD[1], F_SETFL, O_NONBLOCK) == -1) + errx(1, "Unable to set pipe as non-blocking. Exiting"); + + signal(SIGHUP, ZfsDaemon::RescanSignalHandler); + signal(SIGINFO, ZfsDaemon::InfoSignalHandler); + signal(SIGINT, ZfsDaemon::QuitSignalHandler); + signal(SIGTERM, ZfsDaemon::QuitSignalHandler); + signal(SIGUSR1, ZfsDaemon::RescanSignalHandler); + + g_zfsHandle = libzfs_init(); + if (g_zfsHandle == NULL) + errx(1, "Unable to initialize ZFS library. 
Exiting"); + + Callout::Init(); + InitializeSyslog(); + OpenPIDFile(); + + if (g_debug == 0) + daemon(0, 0); + + UpdatePIDFile(); +} + +ZfsDaemon::~ZfsDaemon() +{ + PurgeCaseFiles(); + ClosePIDFile(); +} + +void +ZfsDaemon::PurgeCaseFiles() +{ + CaseFile::PurgeAll(); +} + +bool +ZfsDaemon::VdevAddCaseFile(Vdev &vdev, void *cbArg) +{ + if (vdev.State() != VDEV_STATE_HEALTHY) + CaseFile::Create(vdev); + + return (/*break early*/false); +} + +void +ZfsDaemon::BuildCaseFiles() +{ + ZpoolList zpl; + ZpoolList::iterator pool; + + /* Add CaseFiles for vdevs with issues. */ + for (pool = zpl.begin(); pool != zpl.end(); pool++) + VdevIterator(*pool).Each(VdevAddCaseFile, NULL); + + /* De-serialize any saved cases. */ + CaseFile::DeSerialize(); + + /* Simulate config_sync events to force CaseFile reevaluation */ + for (pool = zpl.begin(); pool != zpl.end(); pool++) { + char evString[160]; + Event *event; + nvlist_t *config; + uint64_t poolGUID; + const char *poolname; + + poolname = zpool_get_name(*pool); + config = zpool_get_config(*pool, NULL); + if (config == NULL) { + syslog(LOG_ERR, "ZFSDaemon::BuildCaseFiles: Could not " + "find pool config for pool %s", poolname); + continue; + } + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &poolGUID) != 0) { + syslog(LOG_ERR, "ZFSDaemon::BuildCaseFiles: Could not " + "find pool guid for pool %s", poolname); + continue; + } + + + snprintf(evString, 160, "!system=ZFS subsystem=ZFS " + "type=misc.fs.zfs.config_sync sub_type=synthesized " + "pool_name=%s pool_guid=%" PRIu64 "\n", poolname, poolGUID); + event = Event::CreateEvent(GetFactory(), string(evString)); + if (event != NULL) { + event->Process(); + delete event; + } + } +} + +void +ZfsDaemon::RescanSystem() +{ + struct gmesh mesh; + struct gclass *mp; + struct ggeom *gp; + struct gprovider *pp; + int result; + + /* + * The devdctl system doesn't replay events for new consumers + * of the interface. 
Emit manufactured DEVFS arrival events + * for any devices that already before we started or during + * periods where we've lost our connection to devd. + */ + result = geom_gettree(&mesh); + if (result != 0) { + syslog(LOG_ERR, "ZfsDaemon::RescanSystem: " + "geom_gettree failed with error %d\n", result); + return; + } + + const string evStart("!system=DEVFS subsystem=CDEV type=CREATE " + "sub_type=synthesized cdev="); + LIST_FOREACH(mp, &mesh.lg_class, lg_class) { + LIST_FOREACH(gp, &mp->lg_geom, lg_geom) { + LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { + Event *event; + + string evString(evStart + pp->lg_name + "\n"); + event = Event::CreateEvent(GetFactory(), + evString); + if (event != NULL) { + if (event->Process()) + SaveEvent(*event); + delete event; + } + } + } + } + geom_deletetree(&mesh); +} + +void +ZfsDaemon::DetectMissedEvents() +{ + do { + PurgeCaseFiles(); + + /* + * Discard any events waiting for us. We don't know + * if they still apply to the current state of the + * system. + */ + FlushEvents(); + + BuildCaseFiles(); + + /* + * If the system state has changed during our + * interrogation, start over. + */ + } while (s_terminateEventLoop == false && EventsPending()); + + RescanSystem(); +} + +void +ZfsDaemon::EventLoop() +{ + while (s_terminateEventLoop == false) { + struct pollfd fds[2]; + int result; + + if (s_logCaseFiles == true) { + EventList::iterator event(m_unconsumedEvents.begin()); + s_logCaseFiles = false; + CaseFile::LogAll(); + while (event != m_unconsumedEvents.end()) + (*event++)->Log(LOG_INFO); + } + + Callout::ExpireCallouts(); + + /* Wait for data. 
*/ + fds[0].fd = m_devdSockFD; + fds[0].events = POLLIN; + fds[0].revents = 0; + fds[1].fd = s_signalPipeFD[0]; + fds[1].events = POLLIN; + fds[1].revents = 0; + result = poll(fds, NUM_ELEMENTS(fds), /*timeout*/INFTIM); + if (result == -1) { + if (errno == EINTR) + continue; + else + err(1, "Polling for devd events failed"); + } else if (result == 0) { + errx(1, "Unexpected result of 0 from poll. Exiting"); + } + + if ((fds[0].revents & POLLIN) != 0) + ProcessEvents(); + + if ((fds[1].revents & POLLIN) != 0) { + static char discardBuf[128]; + + /* + * This pipe exists just to close the signal + * race. Its contents are of no interest to + * us, but we must ensure that future signals + * have space in the pipe to write. + */ + while (read(s_signalPipeFD[0], discardBuf, + sizeof(discardBuf)) > 0) + ; + } + + if (s_systemRescanRequested == true) { + s_systemRescanRequested = false; + syslog(LOG_INFO, "System Rescan request processed."); + RescanSystem(); + } + + if ((fds[0].revents & POLLERR) != 0) { + syslog(LOG_INFO, "POLLERROR detected on devd socket."); + break; + } + + if ((fds[0].revents & POLLHUP) != 0) { + syslog(LOG_INFO, "POLLHUP detected on devd socket."); + break; + } + } +} +//- ZfsDaemon Static Private Methods ------------------------------------------- +void +ZfsDaemon::InfoSignalHandler(int) +{ + s_logCaseFiles = true; + ZfsDaemon::WakeEventLoop(); +} + +void +ZfsDaemon::RescanSignalHandler(int) +{ + RequestSystemRescan(); +} + +void +ZfsDaemon::QuitSignalHandler(int) +{ + s_terminateEventLoop = true; + ZfsDaemon::WakeEventLoop(); +} + +void +ZfsDaemon::OpenPIDFile() +{ + pid_t otherPID; + + s_pidFH = pidfile_open(s_pidFilePath, 0600, &otherPID); + if (s_pidFH == NULL) { + if (errno == EEXIST) + errx(1, "already running as PID %d. 
Exiting", otherPID); + warn("cannot open PID file"); + } +} + +void +ZfsDaemon::UpdatePIDFile() +{ + if (s_pidFH != NULL) + pidfile_write(s_pidFH); +} + +void +ZfsDaemon::ClosePIDFile() +{ + if (s_pidFH != NULL) + pidfile_remove(s_pidFH); +} + +void +ZfsDaemon::InitializeSyslog() +{ + openlog("zfsd", LOG_NDELAY, LOG_DAEMON); +} + diff --git a/cmd/zfsd/zfsd.h b/cmd/zfsd/zfsd.h new file mode 100644 index 000000000000..7b4019c7ae10 --- /dev/null +++ b/cmd/zfsd/zfsd.h @@ -0,0 +1,228 @@ +/*- + * Copyright (c) 2011, 2012, 2013, 2014 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + * + * $FreeBSD$ + */ + +/** + * \file zfsd.h + * + * Class definitions and supporting data structures for the ZFS fault + * management daemon. + * + * Header requirements: + * + * #include + * + * #include + * + * #include + * #include + * #include + * + * #include + * #include + * #include + * #include + * + * #include "vdev_iterator.h" + */ +#ifndef _ZFSD_H_ +#define _ZFSD_H_ + +/*=========================== Forward Declarations ===========================*/ +struct pidfh; + +struct zpool_handle; +typedef struct zpool_handle zpool_handle_t; + +struct zfs_handle; +typedef struct libzfs_handle libzfs_handle_t; + +struct nvlist; +typedef struct nvlist nvlist_t; + +typedef int LeafIterFunc(zpool_handle_t *, nvlist_t *, void *); + +/*================================ Global Data ===============================*/ +extern int g_debug; +extern libzfs_handle_t *g_zfsHandle; + +/*============================= Class Definitions ============================*/ +/*--------------------------------- ZfsDaemon --------------------------------*/ +/** + * Static singleton orchestrating the operations of the ZFS daemon program. + */ +class ZfsDaemon : public DevdCtl::Consumer +{ +public: + /** Return the ZfsDaemon singleton. */ + static ZfsDaemon &Get(); + + /** + * Used by signal handlers to ensure, in a race free way, that + * the event loop will perform at least one more full loop + * before sleeping again. 
+ */ + static void WakeEventLoop(); + + /** + * Schedules a rescan of devices in the system for potential + * candidates to replace a missing vdev. The scan is performed + * during the next run of the event loop. + */ + static void RequestSystemRescan(); + + /** Daemonize and perform all functions of the ZFS daemon. */ + static void Run(); + +private: + ZfsDaemon(); + ~ZfsDaemon(); + + static VdevCallback_t VdevAddCaseFile; + + /** Purge our cache of outstanding ZFS issues in the system. */ + void PurgeCaseFiles(); + + /** Build a cache of outstanding ZFS issues in the system. */ + void BuildCaseFiles(); + + /** + * Iterate over all known issues and attempt to solve them + * given resources currently available in the system. + */ + void RescanSystem(); + + /** + * Interrogate the system looking for previously unknown + * faults that occurred either before ZFSD was started, + * or during a period of lost communication with Devd. + */ + void DetectMissedEvents(); + + /** + * Wait for and process event source activity. + */ + void EventLoop(); + + /** + * Signal handler for which our response is to + * log the current state of the daemon. + * + * \param sigNum The signal caught. + */ + static void InfoSignalHandler(int sigNum); + + /** + * Signal handler for which our response is to + * request a case rescan. + * + * \param sigNum The signal caught. + */ + static void RescanSignalHandler(int sigNum); + + /** + * Signal handler for which our response is to + * gracefully terminate. + * + * \param sigNum The signal caught. + */ + static void QuitSignalHandler(int sigNum); + + /** + * Open and lock our PID file. + */ + static void OpenPIDFile(); + + /** + * Update our PID file with our PID. + */ + static void UpdatePIDFile(); + + /** + * Close and release the lock on our PID file. + */ + static void ClosePIDFile(); + + /** + * Perform syslog configuration. 
+ */ + static void InitializeSyslog(); + + static ZfsDaemon *s_theZfsDaemon; + + /** + * Set to true when our program is signaled to + * log its case files and unconsumed events. + */ + static bool s_logCaseFiles; + + /** + * Set to true when our program is signaled to + * gracefully exit. + */ + static bool s_terminateEventLoop; + + /** + * The canonical path and file name of zfsd's PID file. + */ + static char s_pidFilePath[]; + + /** + * Control structure for PIDFILE(3) API. + */ + static pidfh *s_pidFH; + + /** + * Pipe file descriptors used to close races with our + * signal handlers. + */ + static int s_signalPipeFD[2]; + + /** + * Flag controlling a rescan from ZFSD's event loop of all + * GEOM providers in the system to find candidates for solving + * cases. + */ + static bool s_systemRescanRequested; + + /** + * Flag controlling whether events can be queued. This boolean + * is set during event replay to ensure that events for pools or + * devices no longer in the system are not retained forever. + */ + static bool s_consumingEvents; + + static DevdCtl::EventFactory::Record s_registryEntries[]; +}; + +#endif /* _ZFSD_H_ */ diff --git a/cmd/zfsd/zfsd_event.cc b/cmd/zfsd/zfsd_event.cc new file mode 100644 index 000000000000..45fc7adbb31b --- /dev/null +++ b/cmd/zfsd/zfsd_event.cc @@ -0,0 +1,488 @@ +/*- + * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. 
Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. 
Gibbs (Spectra Logic Corporation) + */ + +/** + * \file zfsd_event.cc + */ +#include +#include +#include +#include +#include + +#include + +#include +#include +/* + * Undefine flush, defined by cpufunc.h on sparc64, because it conflicts with + * C++ flush methods + */ +#undef flush +#undef __init +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "callout.h" +#include "vdev_iterator.h" +#include "zfsd_event.h" +#include "case_file.h" +#include "vdev.h" +#include "zfsd.h" +#include "zfsd_exception.h" +#include "zpool_list.h" + +__FBSDID("$FreeBSD$"); +/*============================ Namespace Control =============================*/ +using DevdCtl::Event; +using DevdCtl::Guid; +using DevdCtl::NVPairMap; +using std::stringstream; + +/*=========================== Class Implementations ==========================*/ + +/*-------------------------------- GeomEvent --------------------------------*/ + +//- GeomEvent Static Public Methods ------------------------------------------- +Event * +GeomEvent::Builder(Event::Type type, + NVPairMap &nvPairs, + const string &eventString) +{ + return (new GeomEvent(type, nvPairs, eventString)); +} + +//- GeomEvent Virtual Public Methods ------------------------------------------ +Event * +GeomEvent::DeepCopy() const +{ + return (new GeomEvent(*this)); +} + +bool +GeomEvent::Process() const +{ + /* + * We only use GEOM events to repair damaged pools. So return early if + * there are no damaged pools + */ + if (CaseFile::Empty()) + return (false); + + /* + * We are only concerned with arrivals and physical path changes, + * because those can be used to satisfy online and autoreplace + * operations + */ + if (Value("type") != "GEOM::physpath" && Value("type") != "CREATE") + return (false); + + /* Log the event since it is of interest. 
*/ + Log(LOG_INFO); + + string devPath; + if (!DevPath(devPath)) + return (false); + + int devFd(open(devPath.c_str(), O_RDONLY)); + if (devFd == -1) + return (false); + + bool inUse; + bool degraded; + nvlist_t *devLabel(ReadLabel(devFd, inUse, degraded)); + + string physPath; + bool havePhysPath(PhysicalPath(physPath)); + + string devName; + DevName(devName); + close(devFd); + + if (inUse && devLabel != NULL) { + OnlineByLabel(devPath, physPath, devLabel); + } else if (degraded) { + syslog(LOG_INFO, "%s is marked degraded. Ignoring " + "as a replace by physical path candidate.\n", + devName.c_str()); + } else if (havePhysPath) { + /* + * TODO: attempt to resolve events using every casefile + * that matches this physpath + */ + CaseFile *caseFile(CaseFile::Find(physPath)); + if (caseFile != NULL) { + syslog(LOG_INFO, + "Found CaseFile(%s:%s:%s) - ReEvaluating\n", + caseFile->PoolGUIDString().c_str(), + caseFile->VdevGUIDString().c_str(), + zpool_state_to_name(caseFile->VdevState(), + VDEV_AUX_NONE)); + caseFile->ReEvaluate(devPath, physPath, /*vdev*/NULL); + } + } + return (false); +} + +//- GeomEvent Protected Methods ----------------------------------------------- +GeomEvent::GeomEvent(Event::Type type, NVPairMap &nvpairs, + const string &eventString) + : DevdCtl::GeomEvent(type, nvpairs, eventString) +{ +} + +GeomEvent::GeomEvent(const GeomEvent &src) + : DevdCtl::GeomEvent::GeomEvent(src) +{ +} + +nvlist_t * +GeomEvent::ReadLabel(int devFd, bool &inUse, bool °raded) +{ + pool_state_t poolState; + char *poolName; + boolean_t b_inuse; + int nlabels; + + inUse = false; + degraded = false; + poolName = NULL; + if (zpool_in_use(g_zfsHandle, devFd, &poolState, + &poolName, &b_inuse) == 0) { + nvlist_t *devLabel = NULL; + + inUse = b_inuse == B_TRUE; + if (poolName != NULL) + free(poolName); + + if (zpool_read_label(devFd, &devLabel, &nlabels) != 0) + return (NULL); + /* + * If we find a disk with fewer than the maximum number of + * labels, it might be the whole 
disk of a partitioned disk + * where ZFS resides on a partition. In that case, we should do + * nothing and wait for the partition to appear. Or, the disk + * might be damaged. In that case, zfsd should do nothing and + * wait for the sysadmin to decide. + */ + if (nlabels != VDEV_LABELS || devLabel == NULL) { + nvlist_free(devLabel); + return (NULL); + } + + try { + Vdev vdev(devLabel); + degraded = vdev.State() != VDEV_STATE_HEALTHY; + return (devLabel); + } catch (ZfsdException &exp) { + string devName = fdevname(devFd); + string devPath = _PATH_DEV + devName; + string context("GeomEvent::ReadLabel: " + + devPath + ": "); + + exp.GetString().insert(0, context); + exp.Log(); + nvlist_free(devLabel); + } + } + return (NULL); +} + +bool +GeomEvent::OnlineByLabel(const string &devPath, const string& physPath, + nvlist_t *devConfig) +{ + bool ret = false; + try { + CaseFileList case_list; + /* + * A device with ZFS label information has been + * inserted. If it matches a device for which we + * have a case, see if we can solve that case. 
+ */ + syslog(LOG_INFO, "Interrogating VDEV label for %s\n", + devPath.c_str()); + Vdev vdev(devConfig); + CaseFile::Find(vdev.PoolGUID(),vdev.GUID(), case_list); + for (CaseFileList::iterator curr = case_list.begin(); + curr != case_list.end(); curr++) { + ret |= (*curr)->ReEvaluate(devPath, physPath, &vdev); + } + return (ret); + + } catch (ZfsdException &exp) { + string context("GeomEvent::OnlineByLabel: " + devPath + ": "); + + exp.GetString().insert(0, context); + exp.Log(); + } + return (ret); +} + + +/*--------------------------------- ZfsEvent ---------------------------------*/ +//- ZfsEvent Static Public Methods --------------------------------------------- +DevdCtl::Event * +ZfsEvent::Builder(Event::Type type, NVPairMap &nvpairs, + const string &eventString) +{ + return (new ZfsEvent(type, nvpairs, eventString)); +} + +//- ZfsEvent Virtual Public Methods -------------------------------------------- +Event * +ZfsEvent::DeepCopy() const +{ + return (new ZfsEvent(*this)); +} + +bool +ZfsEvent::Process() const +{ + string logstr(""); + + if (!Contains("class") && !Contains("type")) { + syslog(LOG_ERR, + "ZfsEvent::Process: Missing class or type data."); + return (false); + } + + /* On config syncs, replay any queued events first. */ + if (Value("type").find("misc.fs.zfs.config_sync") == 0) { + /* + * Even if saved events are unconsumed the second time + * around, drop them. Any events that still can't be + * consumed are probably referring to vdevs or pools that + * no longer exist. + */ + ZfsDaemon::Get().ReplayUnconsumedEvents(/*discard*/true); + CaseFile::ReEvaluateByGuid(PoolGUID(), *this); + } + + if (Value("type").find("misc.fs.zfs.") == 0) { + /* Configuration changes, resilver events, etc. */ + ProcessPoolEvent(); + return (false); + } + + if (!Contains("pool_guid") || !Contains("vdev_guid")) { + /* Only currently interested in Vdev related events. 
*/ + return (false); + } + + CaseFile *caseFile(CaseFile::Find(PoolGUID(), VdevGUID())); + if (caseFile != NULL) { + Log(LOG_INFO); + syslog(LOG_INFO, "Evaluating existing case file\n"); + caseFile->ReEvaluate(*this); + return (false); + } + + /* Skip events that can't be handled. */ + Guid poolGUID(PoolGUID()); + /* If there are no replicas for a pool, then it's not manageable. */ + if (Value("class").find("fs.zfs.vdev.no_replicas") == 0) { + stringstream msg; + msg << "No replicas available for pool " << poolGUID; + msg << ", ignoring"; + Log(LOG_INFO); + syslog(LOG_INFO, "%s", msg.str().c_str()); + return (false); + } + + /* + * Create a case file for this vdev, and have it + * evaluate the event. + */ + ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); + if (zpl.empty()) { + stringstream msg; + int priority = LOG_INFO; + msg << "ZfsEvent::Process: Event for unknown pool "; + msg << poolGUID << " "; + msg << "queued"; + Log(LOG_INFO); + syslog(priority, "%s", msg.str().c_str()); + return (true); + } + + nvlist_t *vdevConfig = VdevIterator(zpl.front()).Find(VdevGUID()); + if (vdevConfig == NULL) { + stringstream msg; + int priority = LOG_INFO; + msg << "ZfsEvent::Process: Event for unknown vdev "; + msg << VdevGUID() << " "; + msg << "queued"; + Log(LOG_INFO); + syslog(priority, "%s", msg.str().c_str()); + return (true); + } + + Vdev vdev(zpl.front(), vdevConfig); + caseFile = &CaseFile::Create(vdev); + if (caseFile->ReEvaluate(*this) == false) { + stringstream msg; + int priority = LOG_INFO; + msg << "ZfsEvent::Process: Unconsumed event for vdev("; + msg << zpool_get_name(zpl.front()) << ","; + msg << vdev.GUID() << ") "; + msg << "queued"; + Log(LOG_INFO); + syslog(priority, "%s", msg.str().c_str()); + return (true); + } + return (false); +} + +//- ZfsEvent Protected Methods ------------------------------------------------- +ZfsEvent::ZfsEvent(Event::Type type, NVPairMap &nvpairs, + const string &eventString) + : DevdCtl::ZfsEvent(type, nvpairs, eventString) +{ 
+} + +ZfsEvent::ZfsEvent(const ZfsEvent &src) + : DevdCtl::ZfsEvent(src) +{ +} + +/* + * Sometimes the kernel won't detach a spare when it is no longer needed. This + * can happen for example if a drive is removed, then either the pool is + * exported or the machine is powered off, then the drive is reinserted, then + * the machine is powered on or the pool is imported. ZFSD must detach these + * spares itself. + */ +void +ZfsEvent::CleanupSpares() const +{ + Guid poolGUID(PoolGUID()); + ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID); + if (!zpl.empty()) { + zpool_handle_t* hdl; + + hdl = zpl.front(); + VdevIterator(hdl).Each(TryDetach, (void*)hdl); + } +} + +void +ZfsEvent::ProcessPoolEvent() const +{ + bool degradedDevice(false); + + /* The pool is destroyed. Discard any open cases */ + if (Value("type") == "misc.fs.zfs.pool_destroy") { + Log(LOG_INFO); + CaseFile::ReEvaluateByGuid(PoolGUID(), *this); + return; + } + + CaseFile *caseFile(CaseFile::Find(PoolGUID(), VdevGUID())); + if (caseFile != NULL) { + if (caseFile->VdevState() != VDEV_STATE_UNKNOWN + && caseFile->VdevState() < VDEV_STATE_HEALTHY) + degradedDevice = true; + + Log(LOG_INFO); + caseFile->ReEvaluate(*this); + } + else if (Value("type") == "misc.fs.zfs.resilver_finish") + { + /* + * It's possible to get a resilver_finish event with no + * corresponding casefile. For example, if a damaged pool were + * exported, repaired, then reimported. + */ + Log(LOG_INFO); + CleanupSpares(); + } + + if (Value("type") == "misc.fs.zfs.vdev_remove" + && degradedDevice == false) { + + /* See if any other cases can make use of this device. */ + Log(LOG_INFO); + ZfsDaemon::RequestSystemRescan(); + } +} + +bool +ZfsEvent::TryDetach(Vdev &vdev, void *cbArg) +{ + /* + * Outline: + * if this device is a spare, and its parent includes one healthy, + * non-spare child, then detach this device. 
+ */ + zpool_handle_t *hdl(static_cast(cbArg)); + + if (vdev.IsSpare()) { + std::list siblings; + std::list::iterator siblings_it; + boolean_t cleanup = B_FALSE; + + Vdev parent = vdev.Parent(); + siblings = parent.Children(); + + /* Determine whether the parent should be cleaned up */ + for (siblings_it = siblings.begin(); + siblings_it != siblings.end(); + siblings_it++) { + Vdev sibling = *siblings_it; + + if (!sibling.IsSpare() && + sibling.State() == VDEV_STATE_HEALTHY) { + cleanup = B_TRUE; + break; + } + } + + if (cleanup) { + syslog(LOG_INFO, "Detaching spare vdev %s from pool %s", + vdev.Path().c_str(), zpool_get_name(hdl)); + zpool_vdev_detach(hdl, vdev.Path().c_str()); + } + + } + + /* Always return false, because there may be other spares to detach */ + return (false); +} diff --git a/cmd/zfsd/zfsd_event.h b/cmd/zfsd/zfsd_event.h new file mode 100644 index 000000000000..fd3f9f7c5200 --- /dev/null +++ b/cmd/zfsd/zfsd_event.h @@ -0,0 +1,144 @@ +/*- + * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + * + * $FreeBSD$ + */ + +/** + * \file zfsd_event.h + * + * \brief Class hierarchy used to express events received via + * the devdctl API. + * + * Header requirements: + * #include + * #include + * #include + * + * #include + * #include + */ + +#ifndef _ZFSD_EVENT_H_ +#define _ZFSD_EVENT_H_ + +/*============================ Namespace Control =============================*/ +using std::string; + +/*=========================== Forward Declarations ===========================*/ +struct zpool_handle; +typedef struct zpool_handle zpool_handle_t; + +struct nvlist; +typedef struct nvlist nvlist_t; + +/*--------------------------------- ZfsEvent ---------------------------------*/ +class ZfsEvent : public DevdCtl::ZfsEvent +{ +public: + /** Specialized DevdCtlEvent object factory for ZFS events. */ + static BuildMethod Builder; + + virtual DevdCtl::Event *DeepCopy() const; + + /** + * Interpret and perform any actions necessary to + * consume the event. + * \return True if this event should be queued for later reevaluation + */ + virtual bool Process() const; + +protected: + /** DeepCopy Constructor. 
*/ + ZfsEvent(const ZfsEvent &src); + + /** Constructor */ + ZfsEvent(Type, DevdCtl::NVPairMap &, const string &); + + /** + * Detach any spares that are no longer needed, but were not + * automatically detached by the kernel + */ + virtual void CleanupSpares() const; + virtual void ProcessPoolEvent() const; + static VdevCallback_t TryDetach; +}; + +class GeomEvent : public DevdCtl::GeomEvent +{ +public: + static BuildMethod Builder; + + virtual DevdCtl::Event *DeepCopy() const; + + virtual bool Process() const; + +protected: + /** DeepCopy Constructor. */ + GeomEvent(const GeomEvent &src); + + /** Constructor */ + GeomEvent(Type, DevdCtl::NVPairMap &, const string &); + + /** + * Attempt to match the ZFS labeled device at devPath with an active + * CaseFile for a missing vdev. If a CaseFile is found, attempt + * to re-integrate the device with its pool. + * + * \param devPath The devfs path to the potential leaf vdev. + * \param physPath The physical path string reported by the device + * at devPath. + * \param devConfig The ZFS label information found on the device + * at devPath. + * + * \return true if the event that caused the online action can + * be considered consumed. + */ + static bool OnlineByLabel(const string &devPath, + const string& physPath, + nvlist_t *devConfig); + + /** + * \brief Read and return label information for a device. + * + * \param devFd The device from which to read ZFS label information. + * \param inUse The device is part of an active or potentially + * active configuration. + * \param degraded The device label indicates the vdev is not healthy. + * + * \return If label information is available, an nvlist describing + * the vdev configuration found on the device specified by + * devFd. Otherwise NULL. 
+ */ + static nvlist_t *ReadLabel(int devFd, bool &inUse, bool °raded); + +}; +#endif /*_ZFSD_EVENT_H_ */ diff --git a/cmd/zfsd/zfsd_exception.cc b/cmd/zfsd/zfsd_exception.cc new file mode 100644 index 000000000000..e83dd0f6e506 --- /dev/null +++ b/cmd/zfsd/zfsd_exception.cc @@ -0,0 +1,135 @@ +/*- + * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + */ + +/** + * \file zfsd_exception + * + * Implementation of the ZfsdException class. 
+ */ +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +#include + +#include "vdev.h" +#include "zfsd_exception.h" + +__FBSDID("$FreeBSD$"); +/*============================ Namespace Control =============================*/ +using std::endl; +using std::string; +using std::stringstream; + +/*=========================== Class Implementations ==========================*/ +/*------------------------------- ZfsdException ------------------------------*/ +ZfsdException::ZfsdException(const char *fmt, ...) + : DevdCtl::Exception(), + m_poolConfig(NULL), + m_vdevConfig(NULL) +{ + va_list ap; + + va_start(ap, fmt); + FormatLog(fmt, ap); + va_end(ap); +} + +ZfsdException::ZfsdException(zpool_handle_t *pool, const char *fmt, ...) + : DevdCtl::Exception(), + m_poolConfig(zpool_get_config(pool, NULL)), + m_vdevConfig(NULL) +{ + va_list ap; + + va_start(ap, fmt); + FormatLog(fmt, ap); + va_end(ap); +} + +ZfsdException::ZfsdException(nvlist_t *poolConfig, const char *fmt, ...) 
+ : DevdCtl::Exception(), + m_poolConfig(poolConfig), + m_vdevConfig(NULL) +{ + va_list ap; + + va_start(ap, fmt); + FormatLog(fmt, ap); + va_end(ap); +} + +void +ZfsdException::Log() const +{ + stringstream output; + + if (m_poolConfig != NULL) { + + output << "Pool "; + + const char *poolName; + if (nvlist_lookup_string(m_poolConfig, ZPOOL_CONFIG_POOL_NAME, + &poolName) == 0) + output << poolName; + else + output << "Unknown"; + output << ": "; + } + + if (m_vdevConfig != NULL) { + + if (m_poolConfig != NULL) { + Vdev vdev(m_poolConfig, m_vdevConfig); + + output << "Vdev " << vdev.GUID() << ": "; + } else { + Vdev vdev(m_vdevConfig); + + output << "Pool " << vdev.PoolGUID() << ": "; + output << "Vdev " << vdev.GUID() << ": "; + } + } + + output << m_log << endl; + syslog(LOG_ERR, "%s", output.str().c_str()); +} + diff --git a/cmd/zfsd/zfsd_exception.h b/cmd/zfsd/zfsd_exception.h new file mode 100644 index 000000000000..5170b2d0dbb1 --- /dev/null +++ b/cmd/zfsd/zfsd_exception.h @@ -0,0 +1,109 @@ +/*- + * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + * + * $FreeBSD$ + */ + +/** + * \file zfsd_exception.h + * + * Definition of the ZfsdException class hierarchy. All exceptions + * explicitly thrown by Zfsd are defined here. + * + * Header requirements: + * #include + * + * #include + */ +#ifndef _ZFSD_EXCEPTION_H_ +#define _ZFSD_EXCEPTION_H_ + +/*=========================== Forward Declarations ===========================*/ +struct zpool_handle; +typedef struct zpool_handle zpool_handle_t; + +struct nvlist; +typedef struct nvlist nvlist_t; + +/*============================= Class Definitions ============================*/ +/*------------------------------- ZfsdException ------------------------------*/ +/** + * \brief Class allowing unified reporting/logging of exceptional events. + */ +class ZfsdException : public DevdCtl::Exception +{ +public: + /** + * \brief ZfsdException constructor allowing arbitrary string + * data to be reported. + * + * \param fmt Printf-like string format specifier. 
+ */ + ZfsdException(const char *fmt, ...); + + /** + * \brief ZfsdException constructor allowing arbitrary string + * data to be reported and associated with the configuration + * data for a ZFS pool. + * + * \param pool Pool handle describing the pool to which this + * exception is associated. + * \param fmt Printf-like string format specifier. + * + * Instantiation with this method is used to report global + * pool errors. + */ + ZfsdException(zpool_handle_t *pool, const char *, ...); + + /** + * \brief ZfsdException constructor allowing arbitrary string + * data to be reported and associated with the configuration + * data for a ZFS pool. + * + * \param poolConfig Pool configuration describing the pool to + * which this exception is associated. + * \param fmt Printf-like string format specifier. + * + * Instantiation with this method is used to report global + * pool errors. + */ + ZfsdException(nvlist_t *poolConfig, const char *, ...); + + /** + * \brief Emit exception data to syslog(3). + */ + virtual void Log() const; +private: + nvlist_t *m_poolConfig; + nvlist_t *m_vdevConfig; +}; + +#endif /* _ZFSD_EXCEPTION_H_ */ diff --git a/cmd/zfsd/zfsd_main.cc b/cmd/zfsd/zfsd_main.cc new file mode 100644 index 000000000000..f090631e21f2 --- /dev/null +++ b/cmd/zfsd/zfsd_main.cc @@ -0,0 +1,90 @@ +/*- + * Copyright (c) 2012, 2013 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. 
Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Alan Somers (Spectra Logic Corporation) + */ + +/** + * \file zfsd_main.cc + * + * main function for the ZFS Daemon. Separated to facilitate testing. + * + */ + +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "vdev_iterator.h" +#include "zfsd.h" + +__FBSDID("$FreeBSD$"); + +/*=============================== Program Main ===============================*/ +static void +usage() +{ + fprintf(stderr, "usage: %s [-d]\n", getprogname()); + exit(1); +} + +/** + * Program entry point. 
+ */ +int +main(int argc, char **argv) +{ + int ch; + + while ((ch = getopt(argc, argv, "d")) != -1) { + switch (ch) { + case 'd': + g_debug++; + break; + default: + usage(); + } + } + + ZfsDaemon::Run(); + + return (0); +} diff --git a/cmd/zfsd/zpool_list.cc b/cmd/zfsd/zpool_list.cc new file mode 100644 index 000000000000..82c35736df13 --- /dev/null +++ b/cmd/zfsd/zpool_list.cc @@ -0,0 +1,122 @@ +/*- + * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. 
Gibbs (Spectra Logic Corporation) + * + * $FreeBSD$ + */ + +/** + * \file zpool_list.cc + * + * Implementation of the ZpoolList class. + */ +#include +#include +#include + +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "vdev.h" +#include "vdev_iterator.h" +#include "zpool_list.h" +#include "zfsd.h" + +/*============================ Namespace Control =============================*/ +using DevdCtl::Guid; + +/*=========================== Class Implementations ==========================*/ +/*--------------------------------- ZpoolList --------------------------------*/ +bool +ZpoolList::ZpoolAll(zpool_handle_t *pool, nvlist_t *poolConfig, void *cbArg) +{ + return (true); +} + +bool +ZpoolList::ZpoolByGUID(zpool_handle_t *pool, nvlist_t *poolConfig, + void *cbArg) +{ + Guid *desiredPoolGUID(static_cast<Guid *>(cbArg)); + uint64_t poolGUID; + + /* We are only interested in the pool that matches our pool GUID. */ + return (nvlist_lookup_uint64(poolConfig, ZPOOL_CONFIG_POOL_GUID, + &poolGUID) == 0 + && poolGUID == (uint64_t)*desiredPoolGUID); +} + +bool +ZpoolList::ZpoolByName(zpool_handle_t *pool, nvlist_t *poolConfig, void *cbArg) +{ + const string &desiredPoolName(*static_cast<const string *>(cbArg)); + + /* We are only interested in the pool that matches our pool name.
*/ + return (desiredPoolName == zpool_get_name(pool)); +} + +int +ZpoolList::LoadIterator(zpool_handle_t *pool, void *data) +{ + ZpoolList *zpl(reinterpret_cast<ZpoolList *>(data)); + nvlist_t *poolConfig(zpool_get_config(pool, NULL)); + + if (zpl->m_filter(pool, poolConfig, zpl->m_filterArg)) + zpl->push_back(pool); + else + zpool_close(pool); + return (0); +} + +ZpoolList::ZpoolList(PoolFilter_t *filter, void * filterArg) + : m_filter(filter), + m_filterArg(filterArg) +{ + zpool_iter(g_zfsHandle, LoadIterator, this); +} + +ZpoolList::~ZpoolList() +{ + for (iterator it(begin()); it != end(); it++) + zpool_close(*it); + + clear(); +} diff --git a/cmd/zfsd/zpool_list.h b/cmd/zfsd/zpool_list.h new file mode 100644 index 000000000000..7d1b176ac710 --- /dev/null +++ b/cmd/zfsd/zpool_list.h @@ -0,0 +1,133 @@ +/*- + * Copyright (c) 2011, 2012, 2013 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Justin T. Gibbs (Spectra Logic Corporation) + * + * $FreeBSD$ + */ + +/** + * \file zpool_list.h + * + * ZpoolList class definition. ZpoolList is a standard container + * allowing filtering and iteration of imported ZFS pool information. + * + * Header requirements: + * + * #include <list> + * #include <string> + */ +#ifndef _ZPOOL_LIST_H_ +#define _ZPOOL_LIST_H_ + +/*============================ Namespace Control =============================*/ +using std::string; + +/*=========================== Forward Declarations ===========================*/ +struct zpool_handle; +typedef struct zpool_handle zpool_handle_t; + +struct nvlist; +typedef struct nvlist nvlist_t; + +class Vdev; + +/*============================= Class Definitions ============================*/ +/*--------------------------------- ZpoolList --------------------------------*/ +class ZpoolList; +typedef bool PoolFilter_t(zpool_handle_t *pool, nvlist_t *poolConfig, + void *filterArg); + +/** + * \brief Container of imported ZFS pool data. + * + * ZpoolList is a convenience class that converts libzfs's ZFS + * pool methods into a standard list container. + */ +class ZpoolList : public std::list<zpool_handle_t *> +{ +public: + /** + * \brief Utility ZpoolList construction filter that causes all + * pools known to the system to be included in the + * instantiated ZpoolList.
+ */ + static PoolFilter_t ZpoolAll; + + /** + * \brief Utility ZpoolList construction filter that causes only + * a pool known to the system and having the specified GUID + * to be included in the instantiated ZpoolList. + */ + static PoolFilter_t ZpoolByGUID; + + /** + * \brief Utility ZpoolList construction filter that causes only + * pools known to the system and having the specified name + * to be included in the instantiated ZpoolList. + */ + static PoolFilter_t ZpoolByName; + + /** + * \brief ZpoolList constructor + * + * \param filter The filter function to use when constructing + * the ZpoolList. This may be one of the static + * utility filters defined for ZpoolList or a + * user defined function. + * \param filterArg A single argument to pass into the filter function + * when it is invoked on each candidate pool. + */ + ZpoolList(PoolFilter_t *filter = ZpoolAll, void *filterArg = NULL); + ~ZpoolList(); + +private: + /** + * \brief Helper routine used to populate the internal + * data store of ZFS pool objects using libzfs's + * zpool_iter() function. + * + * \param pool The ZFS pool object to filter. + * \param data User argument passed through zpool_iter(). + */ + static int LoadIterator(zpool_handle_t *pool, void *data); + + /** + * \brief The filter with which this ZpoolList was constructed. + */ + PoolFilter_t *m_filter; + + /** + * \brief The filter argument with which this ZpoolList was + * constructed. 
+ */ + void *m_filterArg; +}; + +#endif /* _ZPOOL_LIST_H_ */ diff --git a/config/Rules.am b/config/Rules.am index 30c5f353cd23..01d3ff086c0e 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -28,6 +28,15 @@ if BUILD_FREEBSD AM_CFLAGS += -fPIC -Werror -Wno-unknown-pragmas -Wno-enum-conversion AM_CFLAGS += -include $(top_srcdir)/include/os/freebsd/spl/sys/ccompile.h AM_CFLAGS += -I/usr/include -I/usr/local/include + +AM_CXXFLAGS = -std=c++17 -Wall -Wstrict-prototypes -fno-strict-aliasing +AM_CXXFLAGS += $(NO_OMIT_FRAME_POINTER) +AM_CXXFLAGS += $(DEBUG_CFLAGS) +AM_CXXFLAGS += $(ASAN_CFLAGS) +AM_CXXFLAGS += $(CODE_COVERAGE_FLAGS) $(NO_FORMAT_ZERO_LENGTH) +AM_CXXFLAGS += -fPIC -Werror -Wno-unknown-pragmas -Wno-enum-conversion +AM_CXXFLAGS += -include $(top_srcdir)/include/os/freebsd/spl/sys/ccompile.h +AM_CXXFLAGS += -I/usr/include -I/usr/local/include endif AM_CPPFLAGS += -D_GNU_SOURCE diff --git a/configure.ac b/configure.ac index 2ce049c58219..45de74006346 100644 --- a/configure.ac +++ b/configure.ac @@ -49,6 +49,8 @@ AC_CONFIG_HEADERS([zfs_config.h], [ LT_INIT AC_PROG_INSTALL AC_PROG_CC +AC_PROG_CXX +AC_PROG_LIBTOOL AC_PROG_LN_S PKG_PROG_PKG_CONFIG AM_PROG_AS diff --git a/etc/Makefile.am b/etc/Makefile.am index 7187762d3802..df7795097d4e 100644 --- a/etc/Makefile.am +++ b/etc/Makefile.am @@ -21,6 +21,14 @@ SUBSTFILES += $(sysconf_zfs_DATA) SHELLCHECKSCRIPTS += $(sysconf_zfs_DATA) $(call SHELLCHECK_OPTS,$(sysconf_zfs_DATA)): SHELLCHECK_SHELL = sh +if BUILD_FREEBSD +rcdir = $(sysconfdir)/rc.d +rc_SCRIPTS = %D%/rc.d/zfsd + +SUBSTFILES += $(rc_SCRIPTS) +SHELLCHECKSCRIPTS += $(rc_SCRIPTS) +$(call SHELLCHECK_OPTS,$(rc_SCRIPTS)): SHELLCHECK_SHELL = sh +endif if BUILD_LINUX initconf_DATA = \ diff --git a/etc/rc.d/.gitignore b/etc/rc.d/.gitignore new file mode 100644 index 000000000000..5e3efca4d908 --- /dev/null +++ b/etc/rc.d/.gitignore @@ -0,0 +1 @@ +/zfsd diff --git a/etc/rc.d/zfsd.in b/etc/rc.d/zfsd.in new file mode 100644 index 000000000000..942b03f1e4df ---
/dev/null +++ b/etc/rc.d/zfsd.in @@ -0,0 +1,14 @@ +#!/bin/sh + +# PROVIDE: zfsd +# REQUIRE: devd zfs +# KEYWORD: nojail shutdown + +. /etc/rc.subr + +name="zfsd" +rcvar="zfsd_enable" +command="@sbindir@/${name}" + +load_rc_config ${name} +run_rc_command "${1}" From ab9a80865d333004df88dbf63d146998dc828aec Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Wed, 28 Jul 2021 10:49:40 -0400 Subject: [PATCH 05/35] Make acltype=nfsv4 the default on Linux, too Now that we support NFSv4 ACLs on Linux, this can now be made the default across all platforms. Update the documentation and tests accordingly. Signed-off-by: Ryan Moeller --- man/man7/zfsprops.7 | 9 ++------ module/zcommon/zfs_prop.c | 8 +------ .../synctask_core/tst.get_index_props.zcp | 7 ++---- .../cli_root/zfs_get/zfs_get_001_pos.ksh | 9 ++++---- .../cli_root/zfs_get/zfs_get_002_pos.ksh | 8 +++---- .../cli_root/zfs_get/zfs_get_005_neg.ksh | 6 ++--- .../cli_root/zfs_get/zfs_get_008_pos.ksh | 6 ++--- .../cli_root/zfs_get/zfs_get_009_pos.ksh | 12 +++++----- .../inheritance/inherit_001_pos.ksh | 23 ++++++++++--------- 9 files changed, 37 insertions(+), 51 deletions(-) diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index 9ff0236f4d74..3cd225e57b3a 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -660,23 +660,18 @@ platform, the behavior is the same as if it were set to .Sy off . .Bl -tag -compact -offset 4n -width "posixacl" .It Sy off -default on Linux, when a file system has the +when a file system has the .Sy acltype property set to off then ACLs are disabled. .It Sy noacl an alias for .Sy off .It Sy nfsv4 -default on -.Fx , -indicates that NFSv4-style ZFS ACLs should be used. +default, indicates that NFSv4-style ZFS ACLs should be used. These ACLs can be managed with the .Xr getfacl 1 and .Xr setfacl 1 . -The -.Sy nfsv4 -ZFS ACL type is not yet supported on Linux. .It Sy posix indicates POSIX ACLs should be used. POSIX ACLs are specific to Linux and are not functional on other platforms. 
diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 764993b45e7c..31425802f97f 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -436,13 +436,7 @@ zfs_prop_init(void) PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | groupmask | passthrough | restricted", "ACLMODE", acl_mode_table, sfeatures); - zprop_register_index(ZFS_PROP_ACLTYPE, "acltype", -#ifdef __linux__ - /* Linux doesn't natively support ZFS's NFSv4-style ACLs. */ - ZFS_ACLTYPE_OFF, -#else - ZFS_ACLTYPE_NFSV4, -#endif + zprop_register_index(ZFS_PROP_ACLTYPE, "acltype", ZFS_ACLTYPE_NFSV4, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "off | nfsv4 | posix", "ACLTYPE", acltype_table, sfeatures); zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", diff --git a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.zcp b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.zcp index 10ef8e7f839a..1fd23ef02922 100644 --- a/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.zcp +++ b/tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.zcp @@ -27,11 +27,8 @@ props['checksum'] = {{'on', 'default'}, {nil, nil}} props['dedup'] = {{'off', 'default'}, {nil, nil}} props['compression'] = {{'off', 'default'}, {nil, nil}} props['snapdir'] = {{'hidden', 'default'}, {nil, nil}} -if os == "Linux" then - props['acltype'] = {{'off', 'default'}, {'off', 'default'}} -elseif os == "FreeBSD" then - props['aclmode'] = {{'discard', 'default'}, {'discard', 'default'}} -end +props['acltype'] = {{'nfsv4', 'default'}, {'nfsv4', 'default'}} +props['aclmode'] = {{'discard', 'default'}, {'discard', 'default'}} props['aclinherit'] = {{'restricted','default'}, {nil, nil}} props['copies'] = {{'1', 'default'}, {nil, nil}} props['primarycache'] = {{'all', 'default'}, {'all', 'default'}} diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_001_pos.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_001_pos.ksh index 935a85285e8e..f6b0c395f048 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_001_pos.ksh @@ -62,13 +62,12 @@ typeset -r uint64_max="18446744073709551615" typeset zfs_props=("type" used available creation volsize referenced \ compressratio mounted origin recordsize quota reservation mountpoint \ sharenfs checksum compression atime devices exec readonly setuid \ - snapdir aclinherit canmount primarycache secondarycache version \ - usedbychildren usedbydataset usedbyrefreservation usedbysnapshots \ - filesystem_limit snapshot_limit filesystem_count snapshot_count) + snapdir aclinherit aclmode acltype canmount primarycache secondarycache \ + version usedbychildren usedbydataset usedbyrefreservation usedbysnapshots) if is_freebsd; then - typeset zfs_props_os=(jailed aclmode) + typeset zfs_props_os=(jailed) else - typeset zfs_props_os=(zoned acltype) + typeset zfs_props_os=(zoned) fi typeset userquota_props=(userquota@root groupquota@root userused@root \ groupused@root) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_002_pos.ksh index 65986f6bd1a2..461fd4112e93 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_002_pos.ksh @@ -50,12 +50,12 @@ typeset options=(" " p r H) typeset zfs_props=("type" used available creation volsize referenced \ compressratio mounted origin recordsize quota reservation mountpoint \ sharenfs checksum compression atime devices exec readonly setuid \ - snapdir aclinherit canmount primarycache secondarycache version \ - usedbychildren usedbydataset usedbyrefreservation usedbysnapshots) + snapdir aclinherit aclmode acltype canmount primarycache secondarycache \ + version usedbychildren 
usedbydataset usedbyrefreservation usedbysnapshots) if is_freebsd; then - typeset zfs_props_os=(jailed aclmode) + typeset zfs_props_os=(jailed) else - typeset zfs_props_os=(zoned acltype) + typeset zfs_props_os=(zoned) fi typeset userquota_props=(userquota@root groupquota@root userused@root \ groupused@root) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_005_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_005_neg.ksh index d9be907909db..7b3f3a472dd5 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_005_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_005_neg.ksh @@ -48,12 +48,12 @@ typeset val_opts=(p r H) typeset v_props=(type used available creation volsize referenced compressratio \ mounted origin recordsize quota reservation mountpoint sharenfs checksum \ compression atime devices exec readonly setuid snapdir version \ - aclinherit canmount primarycache secondarycache \ + aclinherit aclmode acltype canmount primarycache secondarycache \ usedbychildren usedbydataset usedbyrefreservation usedbysnapshots) if is_freebsd; then - typeset v_props_os=(jailed aclmode) + typeset v_props_os=(jailed) else - typeset v_props_os=(zoned acltype) + typeset v_props_os=(zoned) fi typeset userquota_props=(userquota@root groupquota@root userused@root \ groupused@root) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_008_pos.ksh index 6be432036845..1cb69d8fe0e3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_008_pos.ksh @@ -53,13 +53,13 @@ set -A options " " "-r" "-H" "-p" "-rHp" "-o name" \ set -A props type used available creation volsize referenced compressratio \ mounted origin recordsize quota reservation mountpoint sharenfs \ checksum compression atime devices exec readonly setuid snapdir \ - 
aclinherit canmount primarycache secondarycache version \ + aclinherit aclmode acltype canmount primarycache secondarycache \ usedbychildren usedbydataset usedbyrefreservation usedbysnapshots \ userquota@root groupquota@root userused@root groupused@root if is_freebsd; then - set -A props ${props[*]} jailed aclmode + set -A props ${props[*]} jailed else - set -A props ${props[*]} zoned acltype + set -A props ${props[*]} zoned fi set -A dataset $TESTPOOL/$TESTCTR $TESTPOOL/$TESTFS $TESTPOOL/$TESTVOL \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh index 5686b63b41cc..f9a8f22f5285 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh @@ -52,16 +52,16 @@ fi log_assert "'zfs get -d ' should get expected output." log_onexit depth_fs_cleanup -set -A all_props type used available creation volsize referenced \ - compressratio mounted origin recordsize quota reservation mountpoint \ - sharenfs checksum compression atime devices exec readonly setuid \ - snapdir aclinherit canmount primarycache secondarycache version \ +set -A all_props type used available creation volsize referenced compressratio \ + mounted origin recordsize quota reservation mountpoint sharenfs \ + checksum compression atime devices exec readonly setuid snapdir \ + aclinherit aclmode acltype canmount primarycache secondarycache \ usedbychildren usedbydataset usedbyrefreservation usedbysnapshots \ userquota@root groupquota@root userused@root groupused@root if is_freebsd; then - set -A all_props ${all_props[*]} jailed aclmode + set -A all_props ${all_props[*]} jailed else - set -A all_props ${all_props[*]} zoned acltype + set -A all_props ${all_props[*]} zoned fi depth_fs_setup diff --git a/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh 
b/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh index 1f203e1dc551..cabbeff719f3 100755 --- a/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh @@ -373,6 +373,8 @@ function scan_state { #state-file # set -A prop "checksum" "" \ "compression" "" \ + "aclmode" "" \ + "acltype" "" \ "atime" "" \ "sharenfs" "" \ "recordsize" "recsize" \ @@ -386,13 +388,21 @@ set -A prop "checksum" "" \ # above must have a corresponding entry in the two arrays below. # -set -A def_val "on" "on" "on" \ +set -A def_val "on" \ + "on" \ + "discard" \ + "nfsv4" \ + "on" \ "off" "" \ "hidden" \ "off" \ "all" -set -A local_val "off" "off" "off" \ +set -A local_val "off" \ + "off" \ + "groupmask" \ + "off" \ + "off" \ "on" "" \ "visible" \ "off" \ @@ -401,15 +411,6 @@ set -A local_val "off" "off" "off" \ # # Add system specific values # -if is_linux; then - prop+=("acltype" "") - def_val+=("off") - local_val+=("off") -else - prop+=("aclmode" "") - def_val+=("discard") - local_val+=("groupmask") -fi if is_illumos; then prop+=("mountpoint" "") def_val+=("") From af2a8ddb0e599281e2591b4eacf6fbfa960e10e6 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 17 Aug 2021 15:36:37 -0400 Subject: [PATCH 06/35] Write /sys/kernel/wait_for_device_probe before import. The new sysfs attribute makes the kernel wait for all device probes to complete before returning. Without it, the wait_for_udev call does not give any guarantees. Ticket: NAS-108200 Signed-off-by: Alexander Motin --- contrib/initramfs/scripts/zfs | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index 0a2bd2efda7a..8d54f16bb16b 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -283,6 +283,7 @@ load_module_initrd() while true; do # Wait for all of the /dev/{hd,sd}[a-z] device nodes to appear.
+ echo "1" > /sys/kernel/wait_for_device_probe if command -v wait_for_udev > /dev/null 2>&1 ; then wait_for_udev 10 elif command -v wait_for_dev > /dev/null 2>&1 ; then From 39b4adf91d3f9f2b740cdfd44396aeb820c7c3a5 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Tue, 2 Nov 2021 16:54:30 -0400 Subject: [PATCH 07/35] Fix access check when cred allows override of ACL Properly evaluate edge cases where user credential may grant capability to override DAC in various situations. Switch to using ns-aware checks rather than capable(). Expand optimization allow bypass of zfs_zaccess() in case of trivial ACL if MAY_OPEN is included in requested mask. This will be evaluated in generic_permission() check, which is RCU walk safe. This means that in most cases evaluating permissions on boot volume with NFSv4 ACLs will follow the fast path on checking inode permissions. Additionally, CAP_SYS_ADMIN is granted to nfsd process, and so override for this capability in access2 policy check is removed in favor of a simple check for fsid == 0. Checks for CAP_DAC_OVERRIDE and other override capabilities are kept as-is. 
Signed-off-by: Andrew Walker --- module/os/linux/zfs/policy.c | 25 ++++++++++++++++--------- module/os/linux/zfs/zpl_xattr.c | 3 ++- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/module/os/linux/zfs/policy.c b/module/os/linux/zfs/policy.c index f8a808604671..ebec1e93a436 100644 --- a/module/os/linux/zfs/policy.c +++ b/module/os/linux/zfs/policy.c @@ -114,17 +114,22 @@ secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner, mode_t curmode, mode_t wantmode) { mode_t remainder = ~curmode & wantmode; + uid_t uid = crgetuid(cr); if ((ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_NFSV4) || (remainder == 0)) { return (0); } - /* - * short-circuit if root - */ - if (capable(CAP_SYS_ADMIN)) { + if ((uid == owner) || (uid == 0)) return (0); - } + + if (zpl_inode_owner_or_capable(kcred->user_ns, ip)) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif /* * There are some situations in which capabilities @@ -132,20 +137,22 @@ secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner, */ if (S_ISDIR(ip->i_mode)) { if (!(wantmode & S_IWUSR) && - capable(CAP_DAC_READ_SEARCH)) { + (priv_policy_user(cr, CAP_DAC_READ_SEARCH, EPERM) == 0)) { return (0); } - if (capable(CAP_DAC_OVERRIDE)) { + if (priv_policy_user(cr, CAP_DAC_OVERRIDE, EPERM) == 0) { return (0); } return (EACCES); } - if ((wantmode == S_IRUSR) && capable(CAP_DAC_READ_SEARCH)) { + if ((wantmode == S_IRUSR) && + (priv_policy_user(cr, CAP_DAC_READ_SEARCH, EPERM) == 0)) { return (0); } - if (!(remainder & S_IXUSR) && capable(CAP_DAC_OVERRIDE)) { + if (!(remainder & S_IXUSR) && + (priv_policy_user(cr, CAP_DAC_OVERRIDE, EPERM) == 0)) { return (0); } diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 4005258728da..332dc63970e6 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -107,7 +107,8 @@ static const struct { #endif }; -#define 
GENERIC_MASK(mask) ((mask & ~(MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) +#define POSIX_MASKS (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_OPEN) +#define GENERIC_MASK(mask) ((mask & ~POSIX_MASKS) == 0) enum xattr_permission { XAPERM_DENY, From de35ac0d1efe1ffc415d6407795afe6cea473495 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Wed, 3 Nov 2021 08:03:08 -0400 Subject: [PATCH 08/35] Make zpl_permission work with 5.12+ kernels The "permission" inode operation takes a new `struct user_namespace *` parameter starting in Linux 5.12. Add a configure check and adapt accordingly. Signed-off-by: Ryan Moeller --- include/os/linux/zfs/sys/zpl.h | 5 +++++ module/os/linux/zfs/zpl_xattr.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index afea2336ba76..55cdf51c9711 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -106,7 +106,12 @@ zpl_chmod_acl(struct inode *ip) } #endif /* CONFIG_FS_POSIX_ACL */ +#if defined(HAVE_IOPS_PERMISSION_USERNS) +extern int zpl_permission(struct user_namespace *userns, struct inode *ip, + int mask); +#else extern int zpl_permission(struct inode *ip, int mask); +#endif extern xattr_handler_t *zpl_xattr_handlers[]; diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 332dc63970e6..11f07dbb5b9a 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -1483,7 +1483,11 @@ static xattr_handler_t zpl_xattr_acl_default_handler = { #endif /* CONFIG_FS_POSIX_ACL */ int +#if defined(HAVE_IOPS_PERMISSION_USERNS) +zpl_permission(struct user_namespace *userns, struct inode *ip, int mask) +#else zpl_permission(struct inode *ip, int mask) +#endif { int to_check = 0, i, ret; cred_t *cr = NULL; @@ -1496,7 +1500,11 @@ zpl_permission(struct inode *ip, int mask) */ if ((ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_NFSV4) || ((ITOZ(ip)->z_pflags & ZFS_ACL_TRIVIAL && GENERIC_MASK(mask)))) { +#if 
defined(HAVE_IOPS_PERMISSION_USERNS) + return (generic_permission(userns, ip, mask)); +#else return (generic_permission(ip, mask)); +#endif } for (i = 0; i < ARRAY_SIZE(mask2zfs); i++) { @@ -1510,7 +1518,11 @@ zpl_permission(struct inode *ip, int mask) * NFSv4 ACE. Pass back to default kernel permissions check. */ if (to_check == 0) { +#if defined(HAVE_IOPS_PERMISSION_USERNS) + return (generic_permission(userns, ip, mask)); +#else return (generic_permission(ip, mask)); +#endif } /* From 29d6360344e9c31acca537103069f75c6effa06e Mon Sep 17 00:00:00 2001 From: Andrew Date: Sun, 27 Mar 2022 07:01:11 -0500 Subject: [PATCH 09/35] NAS-115465 / 22.12 / expose ZFS_ACL_TRIVIAL to users (#52) Add ACL_IS_TRIVIAL and ACL_IS_DIR flags as ACL-wide flags in the system.nfs4_acl_xdr generated on getxattr requests. These are non-RFC flags that are useful for userspace applications (especially the ACL_IS_TRIVIAL flag as it can be used to avoid relatively expensive ACL-related operations). Also add system.nfs4_acl_xdr to xattr results if ACL is not trivial. This duplicates POSIX ACL behavior where whether an ACL is set on a path can be determined via listxattr(). Since the ACL is not actually removed, we check whether the ZFS_ACL_TRIVIAL flag is set. If the flag is set, then we omit the xattr name from the list. This allows users to determine whether ACL is trivial from listxattr(). 
Signed-off-by: Andrew Walker --- include/os/linux/spl/sys/acl.h | 2 ++ module/os/linux/zfs/zfs_acl.c | 4 ++++ module/os/linux/zfs/zpl_xattr.c | 10 +++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/os/linux/spl/sys/acl.h b/include/os/linux/spl/sys/acl.h index 5cd7a56b86ec..d66b523b985e 100644 --- a/include/os/linux/spl/sys/acl.h +++ b/include/os/linux/spl/sys/acl.h @@ -83,6 +83,8 @@ typedef struct ace_object { #define ACL_PROTECTED 0x0002 #define ACL_DEFAULTED 0x0004 #define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED|ACL_DEFAULTED) +#define ACL_IS_TRIVIAL 0x10000 +#define ACL_IS_DIR 0x20000 #define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 #define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index 48abbc010917..8b6c613c9932 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -2054,6 +2054,10 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) vsecp->vsa_aclflags |= ACL_PROTECTED; if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; + if (zp->z_pflags & ZFS_ACL_TRIVIAL) + vsecp->vsa_aclflags |= ACL_IS_TRIVIAL; + if (S_ISDIR(ZTOI(zp)->i_mode)) + vsecp->vsa_aclflags |= ACL_IS_DIR; } mutex_exit(&zp->z_acl_lock); diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 11f07dbb5b9a..7d57f804712a 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -275,6 +275,14 @@ zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) goto out1; rw_enter(&zp->z_xattr_lock, RW_READER); + if ((zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4) && + ((zp->z_pflags & ZFS_ACL_TRIVIAL) == 0)) { + error = zpl_xattr_filldir(&xf, NFS41ACL_XATTR, + sizeof (NFS41ACL_XATTR) - 1); + if (error) + goto out; + } + if (zfsvfs->z_use_sa && zp->z_is_sa) { error = zpl_xattr_list_sa(&xf); if (error) @@ -1738,7 +1746,7 @@ nfsacl41i_to_zfsacl(const 
nfsacl41i *nacl, vsecattr_t *_vsecp) vsecattr_t vsecp; vsecp.vsa_aclcnt = nacl->na41_aces.na41_aces_len; - vsecp.vsa_aclflags = nacl->na41_flag; + vsecp.vsa_aclflags = nacl->na41_flag & ACL_FLAGS_ALL; vsecp.vsa_aclentsz = vsecp.vsa_aclcnt * sizeof (ace_t); vsecp.vsa_mask = (VSA_ACE | VSA_ACE_ACLFLAGS); vsecp.vsa_aclentp = kmem_alloc(vsecp.vsa_aclentsz, KM_SLEEP); From d1df3f6cabc3e68ddb7a923d74831cafaf5a830e Mon Sep 17 00:00:00 2001 From: Andrew Date: Tue, 21 Jun 2022 13:48:30 -0500 Subject: [PATCH 10/35] Add ability for xattr handler to "strip" NFSv4 ACL (#54) On Linux POSIX ACLs can be removed via rmxattr() for the relevant system xattrs. On FreeBSD a non-trivial ACL can be converted to one that is described by the mode with no loss of info via combination of acl_get_file(), acl_strip_np(), and acl_set_file(). Since there's no libc equivalent of these ops in Linux for NFSv4 ACLs, this commit makes this less error prone by handling entirely in ZFS. When user performs rmxattr() vfs_setxattr() is called with value of NULL and length of 0. Add special handling for this situation in the xattr handler for the NFSv4 ACL so that we generate a new ACL and zfs_acl_chmod() with the existing mode of file, then set the ACL. 
Signed-off-by: Andrew Walker --- include/sys/zfs_acl.h | 2 + module/os/freebsd/zfs/zfs_acl.c | 6 ++ module/os/linux/zfs/zfs_acl.c | 157 ++++++++++++++++++++++++-------- module/os/linux/zfs/zpl_xattr.c | 12 +-- 4 files changed, 135 insertions(+), 42 deletions(-) diff --git a/include/sys/zfs_acl.h b/include/sys/zfs_acl.h index e19288528849..a137393548d8 100644 --- a/include/sys/zfs_acl.h +++ b/include/sys/zfs_acl.h @@ -211,6 +211,8 @@ void zfs_acl_ids_free(zfs_acl_ids_t *); boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *, uint64_t); int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); +int zfs_stripacl(struct znode *, cred_t *); + void zfs_acl_rele(void *); void zfs_oldace_byteswap(ace_t *, int); void zfs_ace_byteswap(void *, size_t, boolean_t); diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c index 20466aeaaa05..fb47013d06ef 100644 --- a/module/os/freebsd/zfs/zfs_acl.c +++ b/module/os/freebsd/zfs/zfs_acl.c @@ -2029,6 +2029,12 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) return (error); } +int +zfs_stripacl(znode_t *zp, cred_t *cr) +{ + return (SET_ERROR(EOPNOTSUPP)); +} + /* * Check accesses of interest (AoI) against attributes of the dataset * such as read-only. 
Returns zero if no AoI conflict with dataset diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index 8b6c613c9932..85269730a6a7 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -1963,8 +1964,8 @@ zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid) /* * Retrieve a file's ACL */ -int -zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +static int +zfs_getacl_impl(znode_t *zp, vsecattr_t *vsecp, boolean_t stripped, cred_t *cr) { zfs_acl_t *aclp; ulong_t mask; @@ -1975,21 +1976,21 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); - if (mask == 0) - return (SET_ERROR(ENOSYS)); - - if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr, - zfs_init_idmap))) - return (error); - - mutex_enter(&zp->z_acl_lock); - - error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); - if (error != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); + if (stripped) { + mode_t mode = ZTOI(zp)->i_mode; + aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + (aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(S_ISDIR(mode), mode, B_TRUE, + (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_GROUPMASK), aclp); + } else { + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + if (error != 0) + return (error); } + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + /* * Scan ACL to determine number of ACEs */ @@ -2038,7 +2039,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { - memcpy(start, aclnode->z_acldata, + bcopy(aclnode->z_acldata, start, aclnode->z_size); start = (caddr_t)start + aclnode->z_size; } @@ -2060,9 
+2061,29 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) vsecp->vsa_aclflags |= ACL_IS_DIR; } + return (0); +} + +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + int error; + ulong_t mask; + + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) + return (error); + + mutex_enter(&zp->z_acl_lock); + error = zfs_getacl_impl(zp, vsecp, B_FALSE, cr); mutex_exit(&zp->z_acl_lock); - return (0); + return (error); } int @@ -2123,12 +2144,11 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode, /* * Set a file's ACL */ -int -zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +static int +zfs_setacl_impl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zilog_t *zilog = zfsvfs->z_log; - ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); dmu_tx_t *tx; int error; zfs_acl_t *aclp; @@ -2136,16 +2156,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) boolean_t fuid_dirtied; uint64_t acl_obj; - if (mask == 0) - return (SET_ERROR(ENOSYS)); - - if (zp->z_pflags & ZFS_IMMUTABLE) - return (SET_ERROR(EPERM)); - - if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, - zfs_init_idmap))) - return (error); - error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp, &aclp); if (error) @@ -2160,9 +2170,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) (zp->z_pflags & V4_ACL_WIDE_FLAGS); } top: - mutex_enter(&zp->z_acl_lock); - mutex_enter(&zp->z_lock); - tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); @@ -2193,12 +2200,15 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - 
mutex_exit(&zp->z_acl_lock); - mutex_exit(&zp->z_lock); - if (error == ERESTART) { + mutex_exit(&zp->z_acl_lock); + mutex_exit(&zp->z_lock); + dmu_tx_wait(tx); dmu_tx_abort(tx); + + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); goto top; } dmu_tx_abort(tx); @@ -2220,9 +2230,84 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); + return (error); +} + +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + int error; + + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if (zp->z_pflags & ZFS_IMMUTABLE) + return (SET_ERROR(EPERM)); + + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + return (error); + + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + error = zfs_setacl_impl(zp, vsecp, cr); + mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); + return (error); +} + +int +zfs_stripacl(znode_t *zp, cred_t *cr) +{ + int error; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + vsecattr_t vsec = { + .vsa_mask = VSA_ACE_ALLTYPES | VSA_ACECNT | VSA_ACE | + VSA_ACE_ACLFLAGS + }; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (zp->z_pflags & ZFS_IMMUTABLE) { + error = SET_ERROR(EPERM); + goto done; + } + + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, B_FALSE, cr))) + goto done; + + if (zp->z_pflags & ZFS_ACL_TRIVIAL) { + // ACL is already stripped. Nothing to do. 
+ error = 0; + goto done; + } + + mutex_enter(&zp->z_acl_lock); + error = zfs_getacl_impl(zp, &vsec, B_TRUE, cr); + if (error) { + mutex_exit(&zp->z_acl_lock); + goto done; + } + + mutex_enter(&zp->z_lock); + + error = zfs_setacl_impl(zp, &vsec, cr); + + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + + kmem_free(vsec.vsa_aclentp, vsec.vsa_aclentsz); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zfsvfs->z_log, 0); + +done: + ZFS_EXIT(zfsvfs); return (error); } diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 7d57f804712a..beffa2f7ee11 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -1867,12 +1867,12 @@ __zpl_xattr_nfs41acl_set(struct inode *ip, const char *name, if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_NFSV4) return (-EOPNOTSUPP); - /* - * TODO: we may receive NULL value and size 0 - * when rmxattr() on our special xattr is called. - * A function to "strip" the ACL needs to be added - * to avoid POLA violation. 
- */ + if (value == NULL && size == 0) { + crhold(cr); + error = zfs_stripacl(ITOZ(ip), cr); + crfree(cr); + return (error); + } /* xdr data is 4-byte aligned */ if (((ulong_t)value % 4) != 0) { From 44e8ea6a4a61efd1537b8ea6edc446feaefcfe29 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Tue, 20 Jun 2023 02:15:22 +0500 Subject: [PATCH 11/35] Fix ACL build errors on sync with openzfs/master Signed-off-by: Ameer Hamza --- module/os/linux/zfs/zfs_acl.c | 22 +++++++++++++++------- module/os/linux/zfs/zpl_xattr.c | 4 +++- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index 85269730a6a7..9127f79160c9 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -2039,7 +2039,7 @@ zfs_getacl_impl(znode_t *zp, vsecattr_t *vsecp, boolean_t stripped, cred_t *cr) for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { - bcopy(aclnode->z_acldata, start, + memcpy(start, aclnode->z_acldata, aclnode->z_size); start = (caddr_t)start + aclnode->z_size; } @@ -2076,8 +2076,10 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (mask == 0) return (SET_ERROR(ENOSYS)); - if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) + if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr, + zfs_init_idmap))) { return (error); + } mutex_enter(&zp->z_acl_lock); error = zfs_getacl_impl(zp, vsecp, B_FALSE, cr); @@ -2245,8 +2247,10 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (zp->z_pflags & ZFS_IMMUTABLE) return (SET_ERROR(EPERM)); - if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, + zfs_init_idmap))) { return (error); + } mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); @@ -2270,15 +2274,19 @@ zfs_stripacl(znode_t *zp, cred_t *cr) VSA_ACE_ACLFLAGS }; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); 
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); + + if ((error = zfs_verify_zp(zp)) != 0) + goto done; if (zp->z_pflags & ZFS_IMMUTABLE) { error = SET_ERROR(EPERM); goto done; } - if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, B_FALSE, cr))) + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, B_FALSE, cr, + zfs_init_idmap))) goto done; if (zp->z_pflags & ZFS_ACL_TRIVIAL) { @@ -2307,7 +2315,7 @@ zfs_stripacl(znode_t *zp, cred_t *cr) zil_commit(zfsvfs->z_log, 0); done: - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index beffa2f7ee11..326c8215b83d 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -1852,9 +1852,11 @@ __zpl_xattr_nfs41acl_get(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_nfs41acl_get); static int -__zpl_xattr_nfs41acl_set(struct inode *ip, const char *name, +__zpl_xattr_nfs41acl_set(zidmap_t *mnt_ns, + struct inode *ip, const char *name, const void *value, size_t size, int flags) { + (void) mnt_ns; cred_t *cr = CRED(); vsecattr_t vsecp; char *bufp = NULL; From 08b7bccff416c240c5453da8520b705dcbef4e6a Mon Sep 17 00:00:00 2001 From: Andrew Date: Wed, 22 Jun 2022 09:02:40 -0500 Subject: [PATCH 12/35] NAS-116836 / Force BSD semantics for group ownership if NFSV4ACL (#78) When a new file is created on FreeBSD it is given the group of the directory which contains it. On Linux it is given either the effective GID of the process (System V semantics) or the GID of the parent directory (BSD semantics). Since there is no hard-and-fast rule about creation semantics for NFSv4 ACLs on Linux, we should opt for what is least likely to break users' permissions on change from FreeBSD to Linux. Avoid actually setting the SGID bit on dirs unless it was explicitly set. 
Signed-off-by: Andrew Walker --- module/os/linux/zfs/zfs_vnops_os.c | 32 ++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index a32307c39331..9ca09ac38dcb 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -594,6 +594,22 @@ zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, os = zfsvfs->z_os; zilog = zfsvfs->z_log; + /* + * For compatibility purposes with data migrated from FreeBSD + * (which will have NFSv4 ACL type), BSD file creation semantics + * are forced rather than System V. Hence on new file creation + * if NFSV4ACL we inherit GID from parent rather than take current + * process GID. This makes S_ISGID on directories a de-facto + * no-op, but we still honor setting / removing it and normal + * inheritance of the bit on new directories in case user changes + * the underlying ACL type. + */ + if ((vap->va_mask & ATTR_MODE) && + S_ISDIR(ZTOI(dzp)->i_mode) && + (zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4)) { + vap->va_gid = KGID_TO_SGID(ZTOI(dzp)->i_gid); + } + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); @@ -1220,6 +1236,22 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, return (error); zilog = zfsvfs->z_log; + /* + * For compatibility purposes with data migrated from FreeBSD + * (which will have NFSv4 ACL type), BSD file creation semantics + * are forced rather than System V. Hence on new file creation + * if NFSV4ACL we inherit GID from parent rather than take current + * process GID. This makes S_ISGID on directories a de-facto + * no-op, but we still honor setting / removing it and normal + * inheritance of the bit on new directories in case user changes + * the underlying ACL type. 
+ */ + if ((vap->va_mask & ATTR_MODE) && + S_ISDIR(ZTOI(dzp)->i_mode) && + (zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4)) { + vap->va_gid = KGID_TO_SGID(ZTOI(dzp)->i_gid); + } + if (dzp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); From 457bcb08d8a6e9e75360e75bd9c275995ce46c91 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Wed, 29 Jun 2022 14:22:23 +0000 Subject: [PATCH 13/35] initramfs: Skip lvm scan before boot pool import TrueNAS SCALE doesn't boot from pools on top of LVM, and the scan can take a significant amount of time on systems with a large number of disks. Skip the lvm commands in our local-top/zfs script. Signed-off-by: Ryan Moeller --- contrib/initramfs/scripts/local-top/zfs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/initramfs/scripts/local-top/zfs b/contrib/initramfs/scripts/local-top/zfs index 6b80e9f43607..bae1707bdaba 100755 --- a/contrib/initramfs/scripts/local-top/zfs +++ b/contrib/initramfs/scripts/local-top/zfs @@ -48,6 +48,8 @@ activate_vg() } udev_settle -activate_vg +# TrueNAS SCALE doesn't boot from pools on top of LVM, and the scan can take a +# significant amount of time on systems with a large number of disks. Skip it. 
+#activate_vg exit 0 From 5b009301f4e4a6d049b04ecb3d7e33836093213d Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Thu, 8 Sep 2022 13:57:55 +0000 Subject: [PATCH 14/35] Provide kfpu_begin/end from spl Jira: NAS-115648 --- config/kernel-fpu.m4 | 8 +-- include/os/linux/kernel/linux/simd_x86.h | 25 +------- module/Kbuild.in | 1 + module/os/linux/spl/spl-kfpu.c | 75 ++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 27 deletions(-) create mode 100644 module/os/linux/spl/spl-kfpu.c diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4 index edfde1a02d30..f419f2982594 100644 --- a/config/kernel-fpu.m4 +++ b/config/kernel-fpu.m4 @@ -62,7 +62,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FPU], [ ], [ kernel_fpu_begin(); kernel_fpu_end(); - ], [], [ZFS_META_LICENSE]) + ], [], []) ZFS_LINUX_TEST_SRC([__kernel_fpu], [ #include @@ -77,7 +77,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FPU], [ ], [ __kernel_fpu_begin(); __kernel_fpu_end(); - ], [], [ZFS_META_LICENSE]) + ], [], []) ZFS_LINUX_TEST_SRC([kernel_neon], [ #include @@ -92,7 +92,7 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [ dnl # Legacy kernel dnl # AC_MSG_CHECKING([whether kernel fpu is available]) - ZFS_LINUX_TEST_RESULT([kernel_fpu_license], [ + ZFS_LINUX_TEST_RESULT([kernel_fpu], [ AC_MSG_RESULT(kernel_fpu_*) AC_DEFINE(HAVE_KERNEL_FPU, 1, [kernel has kernel_fpu_* functions]) @@ -102,7 +102,7 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [ dnl # dnl # Linux 4.2 kernel dnl # - ZFS_LINUX_TEST_RESULT_SYMBOL([__kernel_fpu_license], + ZFS_LINUX_TEST_RESULT_SYMBOL([__kernel_fpu], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [ AC_MSG_RESULT(__kernel_fpu_*) diff --git a/include/os/linux/kernel/linux/simd_x86.h b/include/os/linux/kernel/linux/simd_x86.h index 699b8a571824..4e99c2d1c212 100644 --- a/include/os/linux/kernel/linux/simd_x86.h +++ b/include/os/linux/kernel/linux/simd_x86.h @@ -106,29 +106,8 @@ #define kfpu_init() 0 #define kfpu_fini() ((void) 0) -#if defined(HAVE_UNDERSCORE_KERNEL_FPU) -#define kfpu_begin() \ -{ \ - 
preempt_disable(); \ - __kernel_fpu_begin(); \ -} -#define kfpu_end() \ -{ \ - __kernel_fpu_end(); \ - preempt_enable(); \ -} - -#elif defined(HAVE_KERNEL_FPU) -#define kfpu_begin() kernel_fpu_begin() -#define kfpu_end() kernel_fpu_end() - -#else -/* - * This case is unreachable. When KERNEL_EXPORTS_X86_FPU is defined then - * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined. - */ -#error "Unreachable kernel configuration" -#endif +extern void kfpu_begin(void); +extern void kfpu_end(void); #else /* defined(KERNEL_EXPORTS_X86_FPU) */ diff --git a/module/Kbuild.in b/module/Kbuild.in index 94d6987b0af5..b544ec7b9d50 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -74,6 +74,7 @@ SPL_OBJS := \ spl-cred.o \ spl-err.o \ spl-generic.o \ + spl-kfpu.o \ spl-kmem-cache.o \ spl-kmem.o \ spl-kstat.o \ diff --git a/module/os/linux/spl/spl-kfpu.c b/module/os/linux/spl/spl-kfpu.c new file mode 100644 index 000000000000..9e781c9f768f --- /dev/null +++ b/module/os/linux/spl/spl-kfpu.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022 iXsystems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef KERNEL_EXPORTS_X86_FPU + +#include + +#if defined(HAVE_KERNEL_FPU) + +void +// cppcheck-suppress syntaxError +kfpu_begin(void) +{ + kernel_fpu_begin(); +} + +void +// cppcheck-suppress syntaxError +kfpu_end(void) +{ + kernel_fpu_end(); +} + +#elif defined(HAVE_UNDERSCORE_KERNEL_FPU) + +void +// cppcheck-suppress syntaxError +kfpu_begin(void) +{ + preempt_disable(); + __kernel_fpu_begin(); +} + +void +// cppcheck-suppress syntaxError +kfpu_end(void) +{ + __kernel_fpu_end(); + preempt_enable(); +} + +#else +/* + * This case should be unreachable. When KERNEL_EXPORTS_X86_FPU is defined + * then either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined. + */ +#error "Unreachable kernel configuration" +#endif + +EXPORT_SYMBOL(kfpu_begin); +EXPORT_SYMBOL(kfpu_end); + +#endif /* KERNEL_EXPORTS_X86_FPU */ From 2df4032585d7bf9f26a540ac62d4cb166fbdfbdf Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Thu, 3 Nov 2022 15:21:28 +0000 Subject: [PATCH 15/35] SCALE: ignore wholedisk We never want to partition vdevs automatically from ZFS in SCALE. Ignore the wholedisk flag in SCALE and skip the tests that expect auto partitioning to work. 
Signed-off-by: Ryan Moeller --- cmd/zed/agents/zfs_mod.c | 9 ++++++--- cmd/zpool/zpool_vdev.c | 6 ++++++ config/Rules.am | 6 ++++++ lib/libzfs/libzfs_pool.c | 5 ++++- lib/libzfs/os/linux/libzfs_pool_os.c | 6 ++++++ .../cli_root/zpool_expand/zpool_expand_005_pos.ksh | 2 ++ .../cli_root/zpool_reopen/zpool_reopen_005_pos.ksh | 2 ++ .../tests/functional/fault/auto_offline_001_pos.ksh | 1 + .../tests/functional/fault/auto_replace_001_pos.ksh | 2 ++ 9 files changed, 35 insertions(+), 4 deletions(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 69163b80bd5a..4bc9f044e49d 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -238,8 +238,9 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &enc_sysfs_path); - +#ifndef TRUENAS_SCALE_NEVER_WHOLEDISK (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); +#endif (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted); @@ -1040,7 +1041,9 @@ vdev_whole_disk_from_config(zpool_handle_t *zhp, const char *vdev_path) if (!nvl) return (0); +#ifndef TRUENAS_SCALE_NEVER_WHOLEDISK (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); +#endif return (wholedisk); } @@ -1088,10 +1091,10 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) zpool_close(zhp); return (0); } - +#ifndef TRUENAS_SCALE_NEVER_WHOLEDISK (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); - +#endif if (wholedisk) { char *tmp; path = strrchr(path, '/'); diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index fbd4b81dfacc..0ba326473e6c 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -989,8 +989,12 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing) * magic value left by the previous filesystem. 
*/ verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); +#ifdef TRUENAS_SCALE_NEVER_WHOLEDISK + wholedisk = B_FALSE; +#else verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); +#endif if (!wholedisk) { /* @@ -1155,9 +1159,11 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, &child, &children) != 0) { verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); +#ifndef TRUENAS_SCALE_NEVER_WHOLEDISK if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); +#endif /* * As a generic check, we look to see if this is a replace of a diff --git a/config/Rules.am b/config/Rules.am index 01d3ff086c0e..6997f4e3bfef 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -52,6 +52,12 @@ AM_CPPFLAGS += -DPKGDATADIR=\"$(pkgdatadir)\" AM_CPPFLAGS += $(DEBUG_CPPFLAGS) AM_CPPFLAGS += $(CODE_COVERAGE_CPPFLAGS) AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-@ac_system_l@-user\" +if BUILD_LINUX +# NAS-118236: Vdevs from CORE are labeled with whole_disk=1, but we did not +# create the partition table in ZFS and must not repartition them. We know +# SCALE never creates wholedisk vdevs, so just ignore the config. +AM_CPPFLAGS += -DTRUENAS_SCALE_NEVER_WHOLEDISK +endif if ASAN_ENABLED AM_CPPFLAGS += -DZFS_ASAN_ENABLED diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 402c14a6baee..720135301598 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3158,8 +3158,10 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0) { uint64_t wholedisk = 0; +#ifndef TRUENAS_SCALE_NEVER_WHOLEDISK (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); +#endif /* * XXX - L2ARC 1.0 devices can't support expansion. 
@@ -4314,7 +4316,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, !(name_flags & VDEV_NAME_PATH)) { path = zfs_strip_path(path); } - +#ifndef TRUENAS_SCALE_NEVER_WHOLEDISK /* * Remove the partition from the path if this is a whole disk. */ @@ -4323,6 +4325,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, == 0 && value && !(name_flags & VDEV_NAME_PATH)) { return (zfs_strip_partition(path)); } +#endif } else { path = type; diff --git a/lib/libzfs/os/linux/libzfs_pool_os.c b/lib/libzfs/os/linux/libzfs_pool_os.c index 86eef3255bc2..088ba4ea2335 100644 --- a/lib/libzfs/os/linux/libzfs_pool_os.c +++ b/lib/libzfs/os/linux/libzfs_pool_os.c @@ -87,6 +87,7 @@ zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg) return (0); } +#ifndef TRUENAS_SCALE_NEVER_WHOLEDISK /* * Read the EFI label from the config, if a label does not exist then * pass back the error to the caller. If the caller has passed a non-NULL @@ -118,6 +119,7 @@ read_efi_label(nvlist_t *config, diskaddr_t *sb) } return (err); } +#endif /* * determine where a partition starts on a disk in the current @@ -126,6 +128,9 @@ read_efi_label(nvlist_t *config, diskaddr_t *sb) static diskaddr_t find_start_block(nvlist_t *config) { +#ifdef TRUENAS_SCALE_NEVER_WHOLEDISK + (void) config; +#else nvlist_t **child; uint_t c, children; diskaddr_t sb = MAXOFFSET_T; @@ -149,6 +154,7 @@ find_start_block(nvlist_t *config) return (sb); } } +#endif return (MAXOFFSET_T); } diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh index 5e8f1d7053e8..7eadb03793a4 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh @@ -45,6 +45,8 @@ verify_runnable "global" +log_unsupported "SKIP because auto partitioning removed for 
SCALE" + function cleanup { poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh index 095f3bc05e66..bccfaa27e31c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh @@ -38,6 +38,8 @@ verify_runnable "global" +log_unsupported "SKIP because auto partitioning removed for SCALE" + function cleanup { log_must zinject -c all diff --git a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh index c0387e1d3235..d1bf2e8641c3 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh @@ -50,6 +50,7 @@ verify_runnable "both" if is_linux; then + log_unsupported "SKIP because auto partitioning removed for SCALE" # Add one 512b scsi_debug device (4Kn would generate IO errors) # NOTE: must be larger than other "file" vdevs and minimum SPA devsize: # add 32m of fudge diff --git a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh index ae56ee9919bf..b6b9f789ed30 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh @@ -49,6 +49,8 @@ verify_runnable "both" +log_unsupported "SKIP because auto partitioning removed for SCALE" + if ! is_physical_device $DISKS; then log_unsupported "Unsupported disks for this test." 
fi From 815f61fda79f83882845c419a1d9b6cdc8ba0186 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Tue, 31 Jan 2023 11:37:09 -0800 Subject: [PATCH 16/35] Fix ZFS_READONLY implementation on Linux (#121) MS-FSCC 2.6 is the governing document for DOS attribute behavior. It specifies the following: For a file, applications can read the file but cannot write to it or delete it. For a directory, applications cannot delete it, but applications can create and delete files from the directory. Signed-off-by: Andrew Walker --- module/os/linux/zfs/zfs_acl.c | 2 +- module/os/linux/zfs/zfs_vnops_os.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index 9127f79160c9..b2a1d265998b 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -2620,7 +2620,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, * Also note: DOS R/O is ignored for directories. */ if ((v4_mode & WRITE_MASK_DATA) && - S_ISDIR(ZTOI(zp)->i_mode) && + !S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_READONLY)) { return (SET_ERROR(EPERM)); } diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 9ca09ac38dcb..c2505f9ea1e9 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1998,10 +1998,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) goto out3; } - if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { - err = SET_ERROR(EPERM); - goto out3; - } + /* ZFS_READONLY will be handled in zfs_zaccess() */ /* * Verify timestamps doesn't overflow 32 bits. From d01282fa5a4eb2bd9f75106d23ceeb16159858de Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Fri, 6 Jan 2023 09:44:56 -0800 Subject: [PATCH 17/35] Simplify get/set NFS4 ACL (#113) This removes an extra memory allocation / free from the NFS4 ACL xattr handler. 
Initially this was written rather quickly in the alpha cycle of SCALE and implemented in a way to ensure that xattr was exactly matching format used internally in samba's vfs_acl_xattr module. Since this time a more efficient conversion between the Samba format and various other ones was added for the purpose of inclusion in the Kernel NFS server. This change simplifies conversion between internal NFS ACL and external xattr representation, but has no impact on userspace and kernel consumers of this xattr (format does not change). Signed-off-by: Andrew Walker --- module/Kbuild.in | 1 - module/Makefile.in | 9 -- module/os/linux/zfs/nfs41acl.x | 74 ----------- module/os/linux/zfs/zpl_xattr.c | 212 +++++++++++++------------------- 4 files changed, 83 insertions(+), 213 deletions(-) delete mode 100644 module/os/linux/zfs/nfs41acl.x diff --git a/module/Kbuild.in b/module/Kbuild.in index b544ec7b9d50..804a544f2b31 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -440,7 +440,6 @@ ZFS_OBJS_OS := \ abd_os.o \ arc_os.o \ mmp_os.o \ - nfs41acl_xdr.o \ policy.o \ qat.o \ qat_compress.o \ diff --git a/module/Makefile.in b/module/Makefile.in index 757dfc088ddf..d42d9cc1f804 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -53,15 +53,6 @@ FMAKE = env -u MAKEFLAGS make $(FMAKEFLAGS) modules-Linux: mkdir -p $(sort $(dir $(spl-objs) $(spl-))) mkdir -p $(sort $(dir $(zfs-objs) $(zfs-))) - @# Generate the nfs41acl XDR header. - rpcgen -h "@abs_srcdir@/os/linux/zfs/nfs41acl.x" > "@abs_srcdir@/os/linux/zfs/nfs41acl.h" - sed -i "/\/c\\#include \" "@abs_srcdir@/os/linux/zfs/nfs41acl.h" - sed -i "9i #include " "@abs_srcdir@/os/linux/zfs/nfs41acl.h" - @# Generate the nfs41acl XDR code. 
- rpcgen -c "@abs_srcdir@/os/linux/zfs/nfs41acl.x" > "@abs_srcdir@/os/linux/zfs/nfs41acl_xdr.c" - sed -i "5i #pragma GCC diagnostic push" "@abs_srcdir@/os/linux/zfs/nfs41acl_xdr.c" - sed -i "6i #pragma GCC diagnostic ignored \"-Wunused-variable\"" "@abs_srcdir@/os/linux/zfs/nfs41acl_xdr.c" - echo "#pragma GCC diagnostic pop" >> "@abs_srcdir@/os/linux/zfs/nfs41acl_xdr.c" @# Build the kernel modules. $(MAKE) -C @LINUX_OBJ@ $(if @KERNEL_CC@,CC=@KERNEL_CC@) \ $(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \ diff --git a/module/os/linux/zfs/nfs41acl.x b/module/os/linux/zfs/nfs41acl.x deleted file mode 100644 index 1df04e8b8af8..000000000000 --- a/module/os/linux/zfs/nfs41acl.x +++ /dev/null @@ -1,74 +0,0 @@ -const ACE4_ACCESS_ALLOWED_ACE_TYPE = 0x00000000; -const ACE4_ACCESS_DENIED_ACE_TYPE = 0x00000001; -const ACE4_SYSTEM_AUDIT_ACE_TYPE = 0x00000002; -const ACE4_SYSTEM_ALARM_ACE_TYPE = 0x00000003; - -typedef u_int acetype4; - -const ACE4_FILE_INHERIT_ACE = 0x00000001; -const ACE4_DIRECTORY_INHERIT_ACE = 0x00000002; -const ACE4_NO_PROPAGATE_INHERIT_ACE = 0x00000004; -const ACE4_INHERIT_ONLY_ACE = 0x00000008; -const ACE4_SUCCESSFUL_ACCESS_ACE_FLAG = 0x00000010; -const ACE4_FAILED_ACCESS_ACE_FLAG = 0x00000020; -const ACE4_IDENTIFIER_GROUP = 0x00000040; -const ACE4_INHERITED_ACE = 0x00000080; - -typedef u_int aceflag4; - -const ACEI4_SPECIAL_WHO = 0x00000001; - -typedef u_int aceiflag4; - -const ACE4_SPECIAL_OWNER = 1; -const ACE4_SPECIAL_GROUP = 2; -const ACE4_SPECIAL_EVERYONE = 3; -const ACE4_SPECIAL_INTERACTIVE = 4; -const ACE4_SPECIAL_NETWORK = 5; -const ACE4_SPECIAL_DIALUP = 6; -const ACE4_SPECIAL_BATCH = 7; -const ACE4_SPECIAL_ANONYMOUS = 8; -const ACE4_SPECIAL_AUTHENTICATED = 9; -const ACE4_SPECIAL_SERVICE = 10; - -const ACE4_READ_DATA = 0x00000001; -const ACE4_LIST_DIRECTORY = 0x00000001; -const ACE4_WRITE_DATA = 0x00000002; -const ACE4_ADD_FILE = 0x00000002; -const ACE4_APPEND_DATA = 0x00000004; -const ACE4_ADD_SUBDIRECTORY = 0x00000004; 
-const ACE4_READ_NAMED_ATTRS = 0x00000008; -const ACE4_WRITE_NAMED_ATTRS = 0x00000010; -const ACE4_EXECUTE = 0x00000020; -const ACE4_DELETE_CHILD = 0x00000040; -const ACE4_READ_ATTRIBUTES = 0x00000080; -const ACE4_WRITE_ATTRIBUTES = 0x00000100; -const ACE4_WRITE_RETENTION = 0x00000200; -const ACE4_WRITE_RETENTION_HOLD = 0x00000400; - -const ACE4_DELETE = 0x00010000; -const ACE4_READ_ACL = 0x00020000; -const ACE4_WRITE_ACL = 0x00040000; -const ACE4_WRITE_OWNER = 0x00080000; -const ACE4_SYNCHRONIZE = 0x00100000; - -typedef u_int acemask4; - -struct nfsace4i { - acetype4 type; - aceflag4 flag; - aceiflag4 iflag; - acemask4 access_mask; - u_int who; -}; - -const ACL4_AUTO_INHERIT = 0x00000001; -const ACL4_PROTECTED = 0x00000002; -const ACL4_DEFAULTED = 0x00000004; - -typedef u_int aclflag4; - -struct nfsacl41i { - aclflag4 na41_flag; - nfsace4i na41_aces<>; -}; diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 326c8215b83d..67f93ea8e8ba 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -85,8 +85,6 @@ #include #include #include -#include -#include "nfs41acl.h" #define NFS41ACL_XATTR "system.nfs4_acl_xdr" @@ -1590,6 +1588,10 @@ zpl_permission(struct inode *ip, int mask) return (ret); } +#define ACEI4_SPECIAL_WHO 1 +#define ACE4_SPECIAL_OWNER 1 +#define ACE4_SPECIAL_GROUP 2 +#define ACE4_SPECIAL_EVERYONE 3 #define NFS41ACL_MAX_ACES 128 #define NFS41_FLAGS (ACE_DIRECTORY_INHERIT_ACE| \ ACE_FILE_INHERIT_ACE| \ @@ -1601,23 +1603,14 @@ zpl_permission(struct inode *ip, int mask) /* * Macros for sanity checks related to XDR and ACL buffer sizes */ -#define ACE4SIZE (sizeof (nfsace4i)) -#define ACLBASE (sizeof (nfsacl41i)) -#define XDRBASE (2 * sizeof (uint_t)) +#define ACE4ELEM 5 +#define ACE4SIZE (ACE4ELEM * sizeof (u32)) +#define XDRBASE (2 * sizeof (u32)) -#define ACES_TO_SIZE(x, y) (x + (y * ACE4SIZE)) -#define SIZE_TO_ACES(x, y) ((y - x) / ACE4SIZE) -#define SIZE_IS_VALID(x, y) ((x >= ACES_TO_SIZE(y, 
0)) && \ - (((x - y) % ACE4SIZE) == 0)) - -#define ACES_TO_ACLSIZE(x) (ACES_TO_SIZE(ACLBASE, x)) -#define ACES_TO_XDRSIZE(x) (ACES_TO_SIZE(XDRBASE, x)) - -#define ACLSIZE_TO_ACES(x) (SIZE_TO_ACES(ACLBASE, x)) -#define XDRSIZE_TO_ACES(x) (SIZE_TO_ACES(XDRBASE, x)) - -#define ACLSIZE_IS_VALID(x) (SIZE_IS_VALID(x, ACLBASE)) -#define XDRSIZE_IS_VALID(x) (SIZE_IS_VALID(x, XDRBASE)) +#define ACES_TO_XDRSIZE(x) (XDRBASE + (x * ACE4SIZE)) +#define XDRSIZE_TO_ACES(x) ((x - XDRBASE) / ACE4SIZE) +#define XDRSIZE_IS_VALID(x) ((x >= XDRBASE) && \ + (((x - XDRBASE) % ACE4SIZE) == 0)) static int __zpl_xattr_nfs41acl_list(struct inode *ip, char *list, size_t list_size, @@ -1637,31 +1630,29 @@ __zpl_xattr_nfs41acl_list(struct inode *ip, char *list, size_t list_size, ZPL_XATTR_LIST_WRAPPER(zpl_xattr_nfs41acl_list); static int -acep_to_nfsace4i(const ace_t *acep, nfsace4i *nacep) +acep_to_nfsace4i(const ace_t *acep, u32 *xattrbuf) { - nacep->type = acep->a_type; - nacep->flag = acep->a_flags & NFS41_FLAGS; - nacep->access_mask = acep->a_access_mask; + u32 who = 0, iflag = 0; switch (acep->a_flags & ACE_TYPE_FLAGS) { case ACE_OWNER: - nacep->iflag |= ACEI4_SPECIAL_WHO; - nacep->who = ACE4_SPECIAL_OWNER; + iflag = ACEI4_SPECIAL_WHO; + who = ACE4_SPECIAL_OWNER; break; case ACE_GROUP|ACE_IDENTIFIER_GROUP: - nacep->iflag |= ACEI4_SPECIAL_WHO; - nacep->who = ACE4_SPECIAL_GROUP; + iflag = ACEI4_SPECIAL_WHO; + who = ACE4_SPECIAL_GROUP; break; case ACE_EVERYONE: - nacep->iflag |= ACEI4_SPECIAL_WHO; - nacep->who = ACE4_SPECIAL_EVERYONE; + iflag = ACEI4_SPECIAL_WHO; + who = ACE4_SPECIAL_EVERYONE; break; case ACE_IDENTIFIER_GROUP: case 0: - nacep->who = acep->a_who; + who = acep->a_who; break; default: @@ -1670,49 +1661,48 @@ acep_to_nfsace4i(const ace_t *acep, nfsace4i *nacep) return (-EINVAL); } + *xattrbuf++ = htonl(acep->a_type); + *xattrbuf++ = htonl(acep->a_flags & NFS41_FLAGS); + *xattrbuf++ = htonl(iflag); + *xattrbuf++ = htonl(acep->a_access_mask); + *xattrbuf++ = htonl(who); + return 
(0); } static int -zfsacl_to_nfsacl41i(const vsecattr_t vsecp, nfsacl41i **_nacl, size_t *_size) +zfsacl_to_nfsacl41i(const vsecattr_t vsecp, u32 *xattrbuf) { - nfsacl41i *nacl = NULL; - nfsace4i *nacep = NULL; + int i, error = 0; ace_t *acep = NULL; - int i, error; - size_t acl_size; - - acl_size = ACES_TO_ACLSIZE(vsecp.vsa_aclcnt); + *xattrbuf++ = htonl(vsecp.vsa_aclflags); + *xattrbuf++ = htonl(vsecp.vsa_aclcnt); - nacl = kmem_alloc(acl_size, KM_SLEEP); - nacl->na41_aces.na41_aces_len = vsecp.vsa_aclcnt; - nacl->na41_flag = vsecp.vsa_aclflags; - nacep = (nfsace4i *)((char *)nacl + sizeof (nfsacl41i)); - nacl->na41_aces.na41_aces_val = nacep; - - for (i = 0; i < nacl->na41_aces.na41_aces_len; i++) { - nacep = &nacl->na41_aces.na41_aces_val[i]; + for (i = 0; i < vsecp.vsa_aclcnt; i++, xattrbuf += ACE4ELEM) { acep = vsecp.vsa_aclentp + (i * sizeof (ace_t)); - error = acep_to_nfsace4i(acep, nacep); - if (error) { - kmem_free(nacl, acl_size); - return (error); - } + + error = acep_to_nfsace4i(acep, xattrbuf); + if (error) + break; } - *_size = acl_size; - *_nacl = nacl; - return (0); + + return (error); } static int -nfsace4i_to_acep(const nfsace4i *nacep, ace_t *acep) +nfsace4i_to_acep(const u32 *xattrbuf, ace_t *acep) { - acep->a_type = nacep->type; - acep->a_flags = nacep->flag & NFS41_FLAGS; - acep->a_access_mask = nacep->access_mask; - if (nacep->iflag & ACEI4_SPECIAL_WHO) { - switch (nacep->who) { + u32 iflag, id; + + acep->a_type = ntohl(*(xattrbuf++)); + acep->a_flags = ntohl(*(xattrbuf++)) & NFS41_FLAGS; + iflag = ntohl(*(xattrbuf++)); + acep->a_access_mask = ntohl(*(xattrbuf++)); + id = ntohl(*(xattrbuf++)); + + if (iflag & ACEI4_SPECIAL_WHO) { + switch (id) { case ACE4_SPECIAL_OWNER: acep->a_flags |= ACE_OWNER; acep->a_who = -1; @@ -1729,51 +1719,57 @@ nfsace4i_to_acep(const nfsace4i *nacep, ace_t *acep) break; default: - dprintf("Unknown id 0x%08x\n", nacep->who); + dprintf("Unknown id 0x%08x\n", id); return (-EINVAL); } } else { - acep->a_who = 
nacep->who; + acep->a_who = id; } return (0); } static int -nfsacl41i_to_zfsacl(const nfsacl41i *nacl, vsecattr_t *_vsecp) +nfsacl41i_to_zfsacl(const u32 *xattrbuf, size_t bufsz, vsecattr_t *vsecp) { - int i, error = 0; - vsecattr_t vsecp; - - vsecp.vsa_aclcnt = nacl->na41_aces.na41_aces_len; - vsecp.vsa_aclflags = nacl->na41_flag & ACL_FLAGS_ALL; - vsecp.vsa_aclentsz = vsecp.vsa_aclcnt * sizeof (ace_t); - vsecp.vsa_mask = (VSA_ACE | VSA_ACE_ACLFLAGS); - vsecp.vsa_aclentp = kmem_alloc(vsecp.vsa_aclentsz, KM_SLEEP); - - for (i = 0; i < vsecp.vsa_aclcnt; i++) { - ace_t *acep = vsecp.vsa_aclentp + (i * sizeof (ace_t)); - nfsace4i *nacep = &nacl->na41_aces.na41_aces_val[i]; - error = nfsace4i_to_acep(nacep, acep); + int error; + int i; + + vsecp->vsa_aclflags = ntohl(*(xattrbuf++)); + vsecp->vsa_aclcnt = ntohl(*(xattrbuf++)); + bufsz -= (2 * sizeof (u32)); + vsecp->vsa_aclentsz = vsecp->vsa_aclcnt * sizeof (ace_t); + + if (bufsz != (vsecp->vsa_aclcnt * ACE4SIZE)) { + dprintf("Embedded ACL count [%d] is larger than " + "can fit in provided buffer size: %zu\n", + vsecp->vsa_aclcnt, bufsz); + return (-ERANGE); + } + + vsecp->vsa_aclentp = kmem_alloc(vsecp->vsa_aclentsz, KM_SLEEP); + + for (i = 0; i < vsecp->vsa_aclcnt; i++, xattrbuf += ACE4ELEM) { + ace_t *acep = vsecp->vsa_aclentp + (i * sizeof (ace_t)); + + error = nfsace4i_to_acep(xattrbuf, acep); if (error) { + kmem_free(vsecp->vsa_aclentp, vsecp->vsa_aclentsz); return (error); } } - *_vsecp = vsecp; - return (error); + + return (0); } static int __zpl_xattr_nfs41acl_get(struct inode *ip, const char *name, void *buffer, size_t size) { - vsecattr_t vsecp; + vsecattr_t vsecp = {0}; cred_t *cr = CRED(); int ret, fl; - size_t acl_size = 0, xdr_size = 0; - XDR xdr = {0}; - boolean_t ok; - nfsacl41i *nacl = NULL; + size_t xdr_size; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME @@ -1827,22 +1823,9 @@ __zpl_xattr_nfs41acl_get(struct inode *ip, const char *name, goto 
nfs4acl_get_out; } - ret = zfsacl_to_nfsacl41i(vsecp, &nacl, &acl_size); - if (ret) { - ret = -ENOMEM; - goto nfs4acl_get_out; - } - - xdrmem_create(&xdr, (char *)buffer, xdr_size, XDR_ENCODE); - ok = xdr_nfsacl41i(&xdr, nacl); - if (!ok) { - ret = -ENOMEM; - kmem_free(nacl, acl_size); - goto nfs4acl_get_out; - } - - kmem_free(nacl, acl_size); - ret = xdr_size; + ret = zfsacl_to_nfsacl41i(vsecp, (u32 *)buffer); + if (ret == 0) + ret = xdr_size; nfs4acl_get_out: kmem_free(vsecp.vsa_aclentp, vsecp.vsa_aclentsz); @@ -1858,13 +1841,8 @@ __zpl_xattr_nfs41acl_set(zidmap_t *mnt_ns, { (void) mnt_ns; cred_t *cr = CRED(); - vsecattr_t vsecp; - char *bufp = NULL; - nfsacl41i *nacl = NULL; - boolean_t ok; - XDR xdr = {0}; - size_t acl_size = 0; int error, fl, naces; + vsecattr_t vsecp = { .vsa_mask = (VSA_ACE | VSA_ACE_ACLFLAGS) }; if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_NFSV4) return (-EOPNOTSUPP); @@ -1889,34 +1867,10 @@ __zpl_xattr_nfs41acl_set(zidmap_t *mnt_ns, if (!XDRSIZE_IS_VALID(size)) { return (-EINVAL); } - bufp = (char *)value; - acl_size = ACES_TO_ACLSIZE(naces); - nacl = kmem_alloc(sizeof (nfsacl41i), KM_SLEEP); - /* - * NULL may still be returned with KM_SLEEP set. - * In principal, checks for SIZE of xattr are - * sufficient to protect, but check for NULL anyway. - */ - if (nacl == NULL) { - return (-ENOMEM); - } - xdrmem_create(&xdr, bufp, acl_size, XDR_DECODE); - ok = xdr_nfsacl41i(&xdr, nacl); - if (!ok) { - kmem_free(nacl, sizeof (nfsacl41i)); - return (-ENOMEM); - } - error = nfsacl41i_to_zfsacl(nacl, &vsecp); - if (error) { - kmem_free(nacl, sizeof (nfsacl41i)); + error = nfsacl41i_to_zfsacl((u32 *)value, size, &vsecp); + if (error) return (error); - } - - /* XDR_DECODE allocates memory for the array of aces */ - kmem_free(nacl->na41_aces.na41_aces_val, - (nacl->na41_aces.na41_aces_len * ACE4SIZE)); - kmem_free(nacl, sizeof (nfsacl41i)); crhold(cr); fl = capable(CAP_DAC_OVERRIDE) ? 
ATTR_NOACLCHECK : 0; From 8a4f60965141e7021cc81221bb4329511ad97ca7 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Thu, 4 May 2023 12:58:47 -0400 Subject: [PATCH 18/35] Improve zpl_permission performance This function can be frequently called with MAY_EXEC|MAY_NOT_BLOCK during RCU path walk. Where possible we should try not to break out of it. In this case we check whether flag ZFS_NO_EXECS_DENIED is set and check mode (similar to fastexecute check in zfs_acl.c). Signed-off-by: Andrew Walker --- module/os/linux/zfs/zpl_xattr.c | 69 +++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 67f93ea8e8ba..138c1cb46747 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -1488,6 +1488,14 @@ static xattr_handler_t zpl_xattr_acl_default_handler = { #endif /* CONFIG_FS_POSIX_ACL */ +/* + * zpl_permission() gets called by linux kernel whenever it checks + * inode_permission via inode->i_op->permission. The general preference + * is to defer to the standard in-kernel permission check (generic_permission) + * wherever possible. + * + * https://www.kernel.org/doc/Documentation/filesystems/vfs.txt + */ int #if defined(HAVE_IOPS_PERMISSION_USERNS) zpl_permission(struct user_namespace *userns, struct inode *ip, int mask) @@ -1532,23 +1540,40 @@ zpl_permission(struct inode *ip, int mask) } /* - * Avoid potentially blocking in RCU walk. + * Fast path for execute checks. Do not use zfs_fastaccesschk_execute + * since it may end up granting execute access in presence of explicit + * deny entry for user / group, and it also read the ZFS ACL + * (non-cached) which we wish to avoid in RCU. */ - if (mask & MAY_NOT_BLOCK) { - return (-ECHILD); - } + if ((to_check == ACE_EXECUTE) && + (ITOZ(ip)->z_pflags & ZFS_NO_EXECS_DENIED)) + return (0); + /* + * inode permission operation may be called in rcu-walk mode + * (mask & MAY_NOT_BLOCK). 
If in rcu-walk mode, the filesystem must + * check the permission without blocking or storing to the inode. + * + * If a situation is encountered that rcu-walk cannot handle, + * return -ECHILD and it will be called again in ref-walk mode. + */ cr = CRED(); crhold(cr); - ret = -zfs_access(ITOZ(ip), to_check, V_ACE_MASK, cr); - if (ret != -EPERM && ret != -EACCES) { - crfree(cr); - return (ret); - } /* - * There are some situations in which capabilities - * may allow overriding the DACL. + * There are some situations in which capabilities may allow overriding + * the DACL. Skip reading ACL if requested permissions are fully + * satisfied by capabilities. + */ + + /* + * CAP_DAC_OVERRIDE may override RWX on directories, and RW on other + * files. Execute may also be overriden if at least one execute bit is + * set. This behavior is not formally documented, but is covered in + * commit messages and code comments in namei.c. + * + * CAP_DAC_READ_SEARCH may bypass file read permission checks and + * directory read and execute permission checks. 
*/ if (S_ISDIR(ip->i_mode)) { #ifdef SB_NFSV4ACL @@ -1565,25 +1590,27 @@ zpl_permission(struct inode *ip, int mask) crfree(cr); return (0); } - crfree(cr); - return (ret); } - if (to_check == ACE_READ_DATA) { - if (capable(CAP_DAC_READ_SEARCH)) { + if (!(mask & MAY_EXEC) || (ip->i_mode & S_IXUGO)) { + if (capable(CAP_DAC_OVERRIDE)) { crfree(cr); return (0); } } - if (!(mask & MAY_EXEC) || - (zfs_fastaccesschk_execute(ITOZ(ip), cr) == 0)) { - if (capable(CAP_DAC_OVERRIDE)) { - crfree(cr); - return (0); - } + if ((to_check == ACE_READ_DATA) && + capable(CAP_DAC_READ_SEARCH)) { + crfree(cr); + return (0); + } + + if (mask & MAY_NOT_BLOCK) { + crfree(cr); + return (-ECHILD); } + ret = -zfs_access(ITOZ(ip), to_check, V_ACE_MASK, cr); crfree(cr); return (ret); } From 343164faac0b5f67d5b636aa45e6b702c7886860 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Wed, 21 Jun 2023 16:34:22 +0500 Subject: [PATCH 19/35] Skip id-mapped tests for now due to nfsv4 acls incompatibility --- .../zfs-tests/tests/functional/idmap_mount/idmap_mount_001.ksh | 2 ++ .../zfs-tests/tests/functional/idmap_mount/idmap_mount_002.ksh | 2 ++ .../zfs-tests/tests/functional/idmap_mount/idmap_mount_003.ksh | 2 ++ .../zfs-tests/tests/functional/idmap_mount/idmap_mount_004.ksh | 2 ++ .../zfs-tests/tests/functional/idmap_mount/idmap_mount_005.ksh | 2 ++ 5 files changed, 10 insertions(+) diff --git a/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_001.ksh b/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_001.ksh index e7187935e532..820e9e544ef9 100755 --- a/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_001.ksh +++ b/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_001.ksh @@ -36,6 +36,8 @@ verify_runnable "global" +log_unsupported "SKIP: nfsv4 acls incompatible with id-mapped mounts for now." 
+ export WORKDIR=$TESTDIR/idmap_test export IDMAPDIR=$TESTDIR/idmap_dest diff --git a/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_002.ksh b/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_002.ksh index 8cba90ea58b7..bb1d379af27b 100755 --- a/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_002.ksh +++ b/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_002.ksh @@ -37,6 +37,8 @@ verify_runnable "global" +log_unsupported "SKIP: nfsv4 acls incompatible with id-mapped mounts for now." + export WORKDIR=$TESTDIR/idmap_test export IDMAPDIR=$TESTDIR/idmap_dest diff --git a/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_003.ksh b/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_003.ksh index 1f1a2aec655e..84e7de13789a 100755 --- a/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_003.ksh +++ b/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_003.ksh @@ -39,6 +39,8 @@ verify_runnable "global" +log_unsupported "SKIP: nfsv4 acls incompatible with id-mapped mounts for now." + export WORKDIR=$TESTDIR/idmap_test export IDMAPDIR=$TESTDIR/idmap_dest diff --git a/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_004.ksh b/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_004.ksh index 89f2f750d23c..3e44da2e0553 100755 --- a/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_004.ksh +++ b/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_004.ksh @@ -38,6 +38,8 @@ verify_runnable "global" +log_unsupported "SKIP: nfsv4 acls incompatible with id-mapped mounts for now." 
+ export WORKDIR=$TESTDIR/idmap_test export IDMAPDIR=$TESTDIR/idmap_dest diff --git a/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_005.ksh b/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_005.ksh index a4ecea92c127..c270dfb8f0d5 100755 --- a/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_005.ksh +++ b/tests/zfs-tests/tests/functional/idmap_mount/idmap_mount_005.ksh @@ -41,6 +41,8 @@ verify_runnable "global" +log_unsupported "SKIP: nfsv4 acls incompatible with id-mapped mounts for now." + export WORKDIR=$TESTDIR/idmap_test export IDMAPDIR=$TESTDIR/idmap_dest From 50f9fd6fcef7f523367acb34ee2189230e76258a Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Tue, 27 Jun 2023 23:24:39 +0500 Subject: [PATCH 20/35] linux 6.3 compat changes for truenas/zfs-2.2-release Signed-off-by: Ameer Hamza --- include/os/linux/zfs/sys/zpl.h | 2 ++ module/os/linux/zfs/policy.c | 2 +- module/os/linux/zfs/zpl_xattr.c | 12 ++++++++---- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 55cdf51c9711..cfb2a6ed3e9f 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -109,6 +109,8 @@ zpl_chmod_acl(struct inode *ip) #if defined(HAVE_IOPS_PERMISSION_USERNS) extern int zpl_permission(struct user_namespace *userns, struct inode *ip, int mask); +#elif defined(HAVE_IOPS_PERMISSION_IDMAP) +extern int zpl_permission(struct mnt_idmap *idmap, struct inode *ip, int mask); #else extern int zpl_permission(struct inode *ip, int mask); #endif diff --git a/module/os/linux/zfs/policy.c b/module/os/linux/zfs/policy.c index ebec1e93a436..16e1762ec87e 100644 --- a/module/os/linux/zfs/policy.c +++ b/module/os/linux/zfs/policy.c @@ -123,7 +123,7 @@ secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner, if ((uid == owner) || (uid == 0)) return (0); - if (zpl_inode_owner_or_capable(kcred->user_ns, ip)) + if (zpl_inode_owner_or_capable(zfs_init_idmap, ip)) 
return (0); #if defined(CONFIG_USER_NS) diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 138c1cb46747..4d6ac1049500 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -1499,6 +1499,8 @@ static xattr_handler_t zpl_xattr_acl_default_handler = { int #if defined(HAVE_IOPS_PERMISSION_USERNS) zpl_permission(struct user_namespace *userns, struct inode *ip, int mask) +#elif defined(HAVE_IOPS_PERMISSION_IDMAP) +zpl_permission(struct mnt_idmap *idmap, struct inode *ip, int mask) #else zpl_permission(struct inode *ip, int mask) #endif @@ -1514,8 +1516,9 @@ zpl_permission(struct inode *ip, int mask) */ if ((ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_NFSV4) || ((ITOZ(ip)->z_pflags & ZFS_ACL_TRIVIAL && GENERIC_MASK(mask)))) { -#if defined(HAVE_IOPS_PERMISSION_USERNS) - return (generic_permission(userns, ip, mask)); +#if (defined(HAVE_IOPS_PERMISSION_USERNS) || \ + defined(HAVE_IOPS_PERMISSION_IDMAP)) + return (generic_permission(zfs_init_idmap, ip, mask)); #else return (generic_permission(ip, mask)); #endif @@ -1532,8 +1535,9 @@ zpl_permission(struct inode *ip, int mask) * NFSv4 ACE. Pass back to default kernel permissions check. */ if (to_check == 0) { -#if defined(HAVE_IOPS_PERMISSION_USERNS) - return (generic_permission(userns, ip, mask)); +#if (defined(HAVE_IOPS_PERMISSION_USERNS) || \ + defined(HAVE_IOPS_PERMISSION_IDMAP)) + return (generic_permission(zfs_init_idmap, ip, mask)); #else return (generic_permission(ip, mask)); #endif From 46e619a2ed6c1becd19ca999259e30c9c67c967f Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Thu, 24 Aug 2023 20:11:59 +0500 Subject: [PATCH 21/35] Do not enable zfs-share.service zfs-share.service executes `zfs share` on every boot to share any filesystem/s, that are shared over SMB and/or NFS using the sharesmb and sharenfs properties. 
Since we do not rely on these properties to share over SMB and NFS and the service fails on boot on TrueNAS if sharesmb and/or sharenfs properties are set, and we rely on middleware to control the SMB and NFS shares, zfs-share.service should be disabled for TrueNAS SCALE. Signed-off-by: Umer Saleem --- contrib/debian/rules.in | 1 - etc/systemd/system/50-zfs.preset | 2 +- etc/systemd/system/zfs-share.service.in | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index 260f9279158f..38c7038a701f 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -163,7 +163,6 @@ override_dh_installinit: dh_installinit -r --no-restart-after-upgrade --name zfs-import dh_installinit -r --no-restart-after-upgrade --name zfs-mount dh_installinit -r --no-restart-after-upgrade --name zfs-load-key - dh_installinit -R --name zfs-share dh_installinit -R --name zfs-zed override_dh_installsystemd: diff --git a/etc/systemd/system/50-zfs.preset b/etc/systemd/system/50-zfs.preset index e4056a92cd98..d2da85d330eb 100644 --- a/etc/systemd/system/50-zfs.preset +++ b/etc/systemd/system/50-zfs.preset @@ -3,7 +3,7 @@ enable zfs-import-cache.service disable zfs-import-scan.service enable zfs-import.target enable zfs-mount.service -enable zfs-share.service +disable zfs-share.service enable zfs-zed.service enable zfs-volume-wait.service enable zfs.target diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in index 1a6342a06fec..7c90f74a684a 100644 --- a/etc/systemd/system/zfs-share.service.in +++ b/etc/systemd/system/zfs-share.service.in @@ -17,4 +17,4 @@ EnvironmentFile=-@initconfdir@/zfs ExecStart=@sbindir@/zfs share -a [Install] -WantedBy=zfs.target +WantedBy=multi-user.target From 02cfb15a5eb35d8e432fe3aca597a8be2d03e323 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Wed, 15 Nov 2023 06:13:53 -0500 Subject: [PATCH 22/35] Remove inode_owner_or_capable override for secpolicy 
(#178) This commit fixes a bug whereby owner@ ACL that limits WRITE_DATA access for the owner of a file was not being properly enforced. The owner of a file should be prevented from write access in this case, but being owner of file should still allow the file owner to chmod, chown, and setacl. Signed-off-by: Andrew Walker --- module/os/linux/zfs/policy.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/module/os/linux/zfs/policy.c b/module/os/linux/zfs/policy.c index 16e1762ec87e..a3801c605dd6 100644 --- a/module/os/linux/zfs/policy.c +++ b/module/os/linux/zfs/policy.c @@ -120,10 +120,7 @@ secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner, return (0); } - if ((uid == owner) || (uid == 0)) - return (0); - - if (zpl_inode_owner_or_capable(zfs_init_idmap, ip)) + if (uid == 0) return (0); #if defined(CONFIG_USER_NS) From f27da867861a9f17b899302b113faa87658e39b0 Mon Sep 17 00:00:00 2001 From: themylogin Date: Thu, 4 Jan 2024 13:41:38 +0100 Subject: [PATCH 23/35] Add Dockerfile Signed-off-by: Vladimir Vinogradenko --- .github/workflows/docker_image.yml | 2 +- .gitignore | 1 + Dockerfile | 27 +++++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 Dockerfile diff --git a/.github/workflows/docker_image.yml b/.github/workflows/docker_image.yml index 7f5e0c2d233c..d9d68d69a85c 100644 --- a/.github/workflows/docker_image.yml +++ b/.github/workflows/docker_image.yml @@ -3,7 +3,7 @@ name: build_image on: push: branches: - - 'truenas/zfs-2.1-release' + - 'truenas/zfs-2.3-release' jobs: docker: diff --git a/.gitignore b/.gitignore index bd3d4befd291..b40208702029 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,7 @@ !configure.ac !copy-builtin !COPYRIGHT +!Dockerfile !LICENSE !Makefile.am !META diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000000..421825f68ada --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM debian:bullseye-slim + +ENV DEBIAN_FRONTEND 
noninteractive + +RUN apt-get update + +RUN apt-get install -y \ + debhelper-compat \ + devscripts + +ENV WORK_DIR /zfs_app/zfs +WORKDIR ${WORK_DIR} + +ADD . ${WORK_DIR}/ + +RUN mk-build-deps --build-dep contrib/debian/control +RUN apt install -y ./*.deb +RUN sh autogen.sh +RUN ./configure +RUN cp -a contrib/debian debian +RUN sed 's/@CFGOPTS@/--enable-debuginfo/g' debian/rules.in > debian/rules +RUN chmod +x debian/rules +RUN dch -b -M --force-distribution --distribution bullseye-truenas-unstable "Tagged from ixsystems/zfs CI" +RUN debuild -us -uc -b +RUN rm ../openzfs-zfs-dracut_*.deb +RUN rm ../openzfs-zfs-initramfs_*.deb +RUN apt-get install -y ../*.deb From a8bc2a41828b3aaff4095918e8c83e6659348872 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 2 Apr 2024 15:14:54 +1100 Subject: [PATCH 24/35] vdev_disk: don't touch vbio after its handed off to the kernel After IO is unplugged, it may complete immediately and vbio_completion be called on interrupt context. That may interrupt or deschedule our task. If it's the last bio, the vbio will be freed. Then, we get rescheduled, and try to write to freed memory through vbio->. This patch just removes the cleanup, and the corresponding assert. These were leftovers from a previous iteration of vbio_submit() and were always "belt and suspenders" ops anyway, never strictly required. Reported-by: Rich Ercolani Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Closes: #16045 Closes: #16050 --- module/os/linux/zfs/vdev_disk.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index ac8fe6cb1bf9..df5fa067797a 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -755,8 +755,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) static void vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) { - ASSERT(vbio->vbio_bdev); - /* * We plug so we can submit the BIOs as we go and only unplug them when * they are fully created and submitted. This is important; if we don't @@ -774,12 +772,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) vbio->vbio_bio->bi_end_io = vbio_completion; vbio->vbio_bio->bi_private = vbio; + /* + * Once submitted, vbio_bio now owns vbio (through bi_private) and we + * can't touch it again. The bio may complete and vbio_completion() be + * called and free the vbio before this task is run again, so we must + * consider it invalid from this point. + */ vdev_submit_bio(vbio->vbio_bio); blk_finish_plug(&plug); - - vbio->vbio_bio = NULL; - vbio->vbio_bdev = NULL; } /* IO completion callback */ From 147d5ff15ab389bbe8ceeca4f71359a95662cf42 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Fri, 12 Apr 2024 03:10:24 +0500 Subject: [PATCH 25/35] Add support for zfs mount -R This commit adds support for mounting a dataset along with all of its children with '-R' flag for zfs mount. There can be scenarios where we want to mount all datasets under one hierarchy instead of mounting all datasets present on system with '-a' flag. '-R' flag should work on all root and non-root datasets. Usage information and man page have been updated for zfs mount. A test for verifying the behavior for '-R' flag is also added. 
Reviewed-by: Ameer Hamza Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Umer Saleem Closes #16015 --- cmd/zfs/zfs_main.c | 75 +++++++-- man/man8/zfs-mount.8 | 6 +- tests/runfiles/common.run | 2 +- tests/runfiles/sanity.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../cli_root/zfs_mount/zfs_mount.cfg | 1 + .../zfs_mount/zfs_mount_recursive.ksh | 146 ++++++++++++++++++ 7 files changed, 216 insertions(+), 18 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index c2147c8f4acd..ec52c563b447 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -309,7 +309,8 @@ get_usage(zfs_help_t idx) "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount\n" - "\tmount [-flvO] [-o opts] <-a | filesystem>\n")); + "\tmount [-flvO] [-o opts] <-a|-R filesystem|" + "filesystem>\n")); case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: @@ -6754,6 +6755,8 @@ zfs_do_holds(int argc, char **argv) #define MOUNT_TIME 1 /* seconds */ typedef struct get_all_state { + char **ga_datasets; + int ga_count; boolean_t ga_verbose; get_all_cb_t *ga_cbp; } get_all_state_t; @@ -6800,19 +6803,35 @@ get_one_dataset(zfs_handle_t *zhp, void *data) return (0); } -static void -get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) +static int +get_recursive_datasets(zfs_handle_t *zhp, void *data) { - get_all_state_t state = { - .ga_verbose = verbose, - .ga_cbp = cbp - }; + get_all_state_t *state = data; + int len = strlen(zfs_get_name(zhp)); + for (int i = 0; i < state->ga_count; ++i) { + if (strcmp(state->ga_datasets[i], zfs_get_name(zhp)) == 0) + return (get_one_dataset(zhp, data)); + else if ((strncmp(state->ga_datasets[i], zfs_get_name(zhp), + len) == 0) && state->ga_datasets[i][len] == '/') { + (void) zfs_iter_filesystems_v2(zhp, 0, + get_recursive_datasets, data); + } + } + zfs_close(zhp); + return (0); +} - if 
(verbose) +static void +get_all_datasets(get_all_state_t *state) +{ + if (state->ga_verbose) set_progress_header(gettext("Reading ZFS config")); - (void) zfs_iter_root(g_zfs, get_one_dataset, &state); + if (state->ga_datasets == NULL) + (void) zfs_iter_root(g_zfs, get_one_dataset, state); + else + (void) zfs_iter_root(g_zfs, get_recursive_datasets, state); - if (verbose) + if (state->ga_verbose) finish_progress(gettext("done.")); } @@ -7158,18 +7177,22 @@ static int share_mount(int op, int argc, char **argv) { int do_all = 0; + int recursive = 0; boolean_t verbose = B_FALSE; int c, ret = 0; char *options = NULL; int flags = 0; /* check options */ - while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:Of" : "al")) + while ((c = getopt(argc, argv, op == OP_MOUNT ? ":aRlvo:Of" : "al")) != -1) { switch (c) { case 'a': do_all = 1; break; + case 'R': + recursive = 1; + break; case 'v': verbose = B_TRUE; break; @@ -7211,7 +7234,7 @@ share_mount(int op, int argc, char **argv) argv += optind; /* check number of arguments */ - if (do_all) { + if (do_all || recursive) { enum sa_protocol protocol = SA_NO_PROTOCOL; if (op == OP_SHARE && argc > 0) { @@ -7220,14 +7243,38 @@ share_mount(int op, int argc, char **argv) argv++; } - if (argc != 0) { + if (argc != 0 && do_all) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } + if (argc == 0 && recursive) { + (void) fprintf(stderr, + gettext("no dataset provided\n")); + usage(B_FALSE); + } + start_progress_timer(); get_all_cb_t cb = { 0 }; - get_all_datasets(&cb, verbose); + get_all_state_t state = { 0 }; + if (argc == 0) { + state.ga_datasets = NULL; + state.ga_count = -1; + } else { + zfs_handle_t *zhp; + for (int i = 0; i < argc; i++) { + zhp = zfs_open(g_zfs, argv[i], + ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + usage(B_FALSE); + zfs_close(zhp); + } + state.ga_datasets = argv; + state.ga_count = argc; + } + state.ga_verbose = verbose; + state.ga_cbp = &cb; + get_all_datasets(&state); if (cb.cb_used == 
0) { free(options); diff --git a/man/man8/zfs-mount.8 b/man/man8/zfs-mount.8 index 35aa187cf063..20dbe4d0e648 100644 --- a/man/man8/zfs-mount.8 +++ b/man/man8/zfs-mount.8 @@ -43,7 +43,7 @@ .Cm mount .Op Fl Oflv .Op Fl o Ar options -.Fl a Ns | Ns Ar filesystem +.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem .Nm zfs .Cm unmount .Op Fl fu @@ -61,7 +61,7 @@ Displays all ZFS file systems currently mounted. .Cm mount .Op Fl Oflv .Op Fl o Ar options -.Fl a Ns | Ns Ar filesystem +.Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem .Xc Mount ZFS filesystem on a path described by its .Sy mountpoint @@ -83,6 +83,8 @@ for more information. .It Fl a Mount all available ZFS file systems. Invoked automatically as part of the boot process if configured. +.It Fl R +Mount the specified filesystems along with all their children. .It Ar filesystem Mount the specified filesystem. .It Fl o Ar options diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 912344b4edde..8f96af6bcdbf 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -252,7 +252,7 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', - 'zfs_mount_test_race'] + 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index ab41c05b8473..695d3697a602 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -155,7 +155,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', - 
'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race'] + 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', + 'zfs_mount_test_race', 'zfs_mount_recursive'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index db6b4c0146a7..f182a2825cd6 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -770,6 +770,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \ functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \ + functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \ functional/cli_root/zfs_mount/zfs_mount_remount.ksh \ functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \ functional/cli_root/zfs_mount/zfs_multi_mount.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg index 06d25faf0356..739baf16086a 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg @@ -31,6 +31,7 @@ export mountcmd=mount export mountforce="$mountcmd -f" export mountall="$mountcmd -a" +export mountrecursive="$mountcmd -R" export unmountcmd=unmount export unmountforce="$unmountcmd -f" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh new file mode 100755 index 000000000000..0e5cc5d6955e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_recursive.ksh @@ -0,0 +1,146 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2024, iXsystems Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib + +# +# DESCRIPTION: +# Verify zfs mount -R functionality. +# +# STRATEGY: +# 1. Create nested datasets +# 2. Unmount all datasets +# 3. Recursively mount root datasets, this should mount all datasets +# present in a pool +# 4. Unmount all datasets +# 5. Recursively mount child datasets with children. This should mount +# child datasets, but not the root dataset or parent datasets +# 6. Unmount all datasets +# 7. Mount root dataset recursively again and confirm all child +# datasets are mounted. 
+# + +verify_runnable "both" + +function cleanup +{ + log_must datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -R + log_must datasetexists $TESTPOOL/$TESTFS2 && \ + destroy_dataset $TESTPOOL/$TESTFS2 -R + log_must datasetexists $TESTPOOL/$TESTFS3 && \ + destroy_dataset $TESTPOOL/$TESTFS3 -R +} + +function setup_all +{ + log_must datasetexists $TESTPOOL/$TESTFS || zfs create $TESTPOOL/$TESTFS + log_must zfs create $TESTPOOL/$TESTFS1 + log_must zfs create $TESTPOOL/$TESTFS2 + log_must zfs create $TESTPOOL/$TESTFS3 + log_must zfs create $TESTPOOL/$TESTFS2/child1 + log_must zfs create $TESTPOOL/$TESTFS2/child2 + log_must zfs create $TESTPOOL/$TESTFS2/child3 + log_must zfs create $TESTPOOL/$TESTFS2/child2/subchild + log_must zfs create $TESTPOOL/$TESTFS3/child +} + +log_assert "Verify that 'zfs $mountrecursive' successfully, " \ + "mounts the dataset along with all its children." + +log_onexit cleanup + +log_must setup_all + +log_must zfs $unmountall + +log_must zfs $mountrecursive $TESTPOOL + +log_must mounted $TESTPOOL +log_must mounted $TESTPOOL/$TESTFS +log_must mounted $TESTPOOL/$TESTFS1 +log_must mounted $TESTPOOL/$TESTFS2 +log_must mounted $TESTPOOL/$TESTFS3 +log_must mounted $TESTPOOL/$TESTFS2/child1 +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child3 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_must mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $unmountall + +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $mountrecursive $TESTPOOL/$TESTFS2 $TESTPOOL/$TESTFS3 + +log_mustnot mounted $TESTPOOL 
+log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_must mounted $TESTPOOL/$TESTFS2 +log_must mounted $TESTPOOL/$TESTFS3 +log_must mounted $TESTPOOL/$TESTFS2/child1 +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child3 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_must mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $unmountall + +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_must zfs $mountrecursive $TESTPOOL/$TESTFS2/child2 + +log_must mounted $TESTPOOL/$TESTFS2/child2 +log_must mounted $TESTPOOL/$TESTFS2/child2/subchild +log_mustnot mounted $TESTPOOL +log_mustnot mounted $TESTPOOL/$TESTFS +log_mustnot mounted $TESTPOOL/$TESTFS1 +log_mustnot mounted $TESTPOOL/$TESTFS2 +log_mustnot mounted $TESTPOOL/$TESTFS3 +log_mustnot mounted $TESTPOOL/$TESTFS2/child1 +log_mustnot mounted $TESTPOOL/$TESTFS2/child3 +log_mustnot mounted $TESTPOOL/$TESTFS3/child + +log_pass "'zfs $mountrecursive' behaves as expected." From 74319f92c153485dd3b4b89fa7eb544fc83483c7 Mon Sep 17 00:00:00 2001 From: themylogin Date: Mon, 22 Apr 2024 14:39:48 +0200 Subject: [PATCH 26/35] Empty commit to trigger CI From 5c57f7ecfc65b6dad84526f7443f1f377a481b57 Mon Sep 17 00:00:00 2001 From: themylogin Date: Mon, 22 Apr 2024 17:38:48 +0200 Subject: [PATCH 27/35] Use debian bookworm to build Docker image TrueNAS 24.10 is based on Debian Bookworm. We need to build py-libzfs CI packages for Bookworm. 
--- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 421825f68ada..7e049537fe71 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM debian:bullseye-slim +FROM debian:bookworm-slim ENV DEBIAN_FRONTEND noninteractive From cdac10e4dedc6178d1aebea57eb8383844bfe841 Mon Sep 17 00:00:00 2001 From: themylogin Date: Mon, 22 Apr 2024 18:07:20 +0200 Subject: [PATCH 28/35] Fix Dockerfile --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7e049537fe71..4ff1abdd8df1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,8 @@ RUN apt-get update RUN apt-get install -y \ debhelper-compat \ - devscripts + devscripts \ + linux-headers-amd64 ENV WORK_DIR /zfs_app/zfs WORKDIR ${WORK_DIR} From 667c282c9723d6077b3b6f8df01c77536a72a6a5 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Wed, 24 Apr 2024 07:02:27 -0700 Subject: [PATCH 29/35] Fix return value for setting zvol threading (#230) We must return -1 instead of ENOENT if the special zvol threading property set function can't locate the dataset (this would typically happen with an encrypted and unmounted zvol) so that the operation gets inserted properly into the nvlist for operations to set. This is because we want the property to be set once the zvol is decrypted again. 
Signed-off-by: Andrew Walker --- module/zfs/zvol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 5b6a3f5cb410..a71f3738392e 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -372,7 +372,7 @@ zvol_set_volthreading(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) - return (ENOENT); + return (-1); zv->zv_threading = value; mutex_exit(&zv->zv_state_lock); return (0); From cbfc3bd9bc54d776fee300cb3b10252f4ec33fd8 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Wed, 8 May 2024 21:09:05 +0500 Subject: [PATCH 30/35] Add libtirpc-dev to Build-Depends in control Signed-off-by: Umer Saleem --- contrib/debian/control | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/debian/control b/contrib/debian/control index 5b7874d11401..a0d10a8f502a 100644 --- a/contrib/debian/control +++ b/contrib/debian/control @@ -10,6 +10,7 @@ Build-Depends: debhelper-compat (= 12), libelf-dev, libpam0g-dev, libssl-dev | libssl1.0-dev, + libtirpc-dev, libtool, libudev-dev, lsb-release, From 92264faef451e0ff943c95457c7c35a1b2a61d9b Mon Sep 17 00:00:00 2001 From: Aiden Baker Date: Tue, 30 Jul 2024 10:53:10 -0400 Subject: [PATCH 31/35] Update zed.rc to disable emailing Signed-off-by: Aiden Baker --- cmd/zed/zed.d/zed.rc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index af56147a969b..bcf774e04d7d 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -14,7 +14,7 @@ # Email will only be sent if ZED_EMAIL_ADDR is defined. # Enabled by default; comment to disable. 
# -ZED_EMAIL_ADDR="root" +#ZED_EMAIL_ADDR="root" ## # Name or path of executable responsible for sending notifications via email; From e11753560b9cbc8affc815be2469f8894346c1a8 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Tue, 13 Aug 2024 05:44:16 -0600 Subject: [PATCH 32/35] Fix namespace handling in zpl_permission (#246) This commit fixes user / idmap namespaces in zpl_permission. ZFS updates to address kernel changes were subtly broken and passing the wrong namespace to generic_permission(). Since zpl_permission was initially written, zfs_zaccess() has become idmap-aware. This commit switches from using zfs_access to zfs_zaccess() and improves zfs_zaccess_aces_check() so that uids / gids in ACL entries are converted via idmap configuration prior to checking access. Signed-off-by: Andrew Walker --- module/os/linux/zfs/zfs_acl.c | 7 ++++++- module/os/linux/zfs/zpl_xattr.c | 34 ++++++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index b2a1d265998b..e83672c468fa 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -2442,8 +2442,11 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, break; case OWNING_GROUP: who = gowner; - zfs_fallthrough; + checkit = zfs_groupmember(zfsvfs, who, cr); + break; case ACE_IDENTIFIER_GROUP: + who = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ZTOI(zp)), + who); checkit = zfs_groupmember(zfsvfs, who, cr); break; case ACE_EVERYONE: @@ -2454,6 +2457,8 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, default: if (entry_type == 0) { uid_t newid; + who = zfs_uid_to_vfsuid(mnt_ns, + zfs_i_user_ns(ZTOI(zp)), who); newid = zfs_fuid_map_id(zfsvfs, who, cr, ZFS_ACE_USER); diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 4d6ac1049500..f733d5e1ba39 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -1507,6 +1507,7 @@ 
zpl_permission(struct inode *ip, int mask) { int to_check = 0, i, ret; cred_t *cr = NULL; + zfsvfs_t *zfsvfs = NULL; /* * If NFSv4 ACLs are not being used, go back to @@ -1516,9 +1517,10 @@ zpl_permission(struct inode *ip, int mask) */ if ((ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_NFSV4) || ((ITOZ(ip)->z_pflags & ZFS_ACL_TRIVIAL && GENERIC_MASK(mask)))) { -#if (defined(HAVE_IOPS_PERMISSION_USERNS) || \ - defined(HAVE_IOPS_PERMISSION_IDMAP)) - return (generic_permission(zfs_init_idmap, ip, mask)); +#if defined(HAVE_IOPS_PERMISSION_USERNS) + return (generic_permission(userns, ip, mask)); +#elif defined(HAVE_IOPS_PERMISSION_IDMAP) + return (generic_permission(idmap, ip, mask)); #else return (generic_permission(ip, mask)); #endif @@ -1535,9 +1537,10 @@ zpl_permission(struct inode *ip, int mask) * NFSv4 ACE. Pass back to default kernel permissions check. */ if (to_check == 0) { -#if (defined(HAVE_IOPS_PERMISSION_USERNS) || \ - defined(HAVE_IOPS_PERMISSION_IDMAP)) - return (generic_permission(zfs_init_idmap, ip, mask)); +#if defined(HAVE_IOPS_PERMISSION_USERNS) + return (generic_permission(userns, ip, mask)); +#elif defined(HAVE_IOPS_PERMISSION_IDMAP) + return (generic_permission(idmap, ip, mask)); #else return (generic_permission(ip, mask)); #endif @@ -1614,7 +1617,24 @@ zpl_permission(struct inode *ip, int mask) return (-ECHILD); } - ret = -zfs_access(ITOZ(ip), to_check, V_ACE_MASK, cr); + zfsvfs = ZTOZSB(ITOZ(ip)); + + if ((ret = -zfs_enter_verify_zp(zfsvfs, ITOZ(ip), FTAG)) != 0) + return (ret); + +#if defined(HAVE_IOPS_PERMISSION_USERNS) + ret = -zfs_zaccess(ITOZ(ip), to_check, V_ACE_MASK, B_FALSE, cr, + userns); +#elif defined(HAVE_IOPS_PERMISSION_IDMAP) + ret = -zfs_zaccess(ITOZ(ip), to_check, V_ACE_MASK, B_FALSE, cr, + idmap); +#else + ret = -zfs_zaccess(ITOZ(ip), to_check, V_ACE_MASK, B_FALSE, cr, + zfs_init_idmap); +#endif + + zfs_exit(zfsvfs, FTAG); + crfree(cr); return (ret); } From 4460085c393889a56e88fdcec6af2df629625c30 Mon Sep 17 00:00:00 2001 From: Andrew 
Walker Date: Thu, 15 Aug 2024 09:59:34 -0600 Subject: [PATCH 33/35] Increase maximum permitted ACL entries (#250) Allow up to ZFS internal maximum ACL entries Signed-off-by: Andrew Walker --- module/os/linux/zfs/zpl_xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index f733d5e1ba39..1b9c354fcec5 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -1643,7 +1643,7 @@ zpl_permission(struct inode *ip, int mask) #define ACE4_SPECIAL_OWNER 1 #define ACE4_SPECIAL_GROUP 2 #define ACE4_SPECIAL_EVERYONE 3 -#define NFS41ACL_MAX_ACES 128 +#define NFS41ACL_MAX_ACES MAX_ACL_ENTRIES #define NFS41_FLAGS (ACE_DIRECTORY_INHERIT_ACE| \ ACE_FILE_INHERIT_ACE| \ ACE_NO_PROPAGATE_INHERIT_ACE| \ From b65aeb44eef0e854bed204a6b8d7db93b276d98b Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Wed, 4 Sep 2024 00:40:44 +0500 Subject: [PATCH 34/35] Revert "Make mount.zfs(8) calling zfs_mount_at for legacy mounts" This reverts commit 34118eac06fba834f0c934419aec1b386c98665a. 
Signed-off-by: Umer Saleem --- cmd/mount_zfs.c | 5 +++-- module/os/linux/zfs/zfs_ctldir.c | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cmd/mount_zfs.c b/cmd/mount_zfs.c index 283074daf717..fc9220950647 100644 --- a/cmd/mount_zfs.c +++ b/cmd/mount_zfs.c @@ -269,7 +269,8 @@ main(int argc, char **argv) return (MOUNT_USAGE); } - if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { + if (!zfsutil || sloppy || + libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt); } @@ -336,7 +337,7 @@ main(int argc, char **argv) dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt); if (!fake) { - if (!remount && !sloppy && + if (zfsutil && !sloppy && !libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint); if (error) { diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index e042116333fb..54ed70d0394f 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) zfsvfs_t *snap_zfsvfs; zfs_snapentry_t *se; char *full_name, *full_path; - char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n", - NULL, NULL, NULL }; + char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, + NULL }; char *envp[] = { NULL }; int error; struct path spath; @@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) * value from call_usermodehelper() will be (exitcode << 8 + signal). 
*/ dprintf("mount; name=%s path=%s\n", full_name, full_path); - argv[6] = full_name; - argv[7] = full_path; + argv[5] = full_name; + argv[6] = full_path; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) { From 927f6eae697ba327a98a5d061879ac0df98dafe3 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Fri, 6 Sep 2024 21:03:50 +0500 Subject: [PATCH 35/35] Revert "Revert "Make mount.zfs(8) calling zfs_mount_at for legacy mounts"" This reverts commit f91496453c6bb53fe04d21d9f0e5e59c54e80624. Signed-off-by: Umer Saleem --- cmd/mount_zfs.c | 5 ++--- module/os/linux/zfs/zfs_ctldir.c | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cmd/mount_zfs.c b/cmd/mount_zfs.c index fc9220950647..283074daf717 100644 --- a/cmd/mount_zfs.c +++ b/cmd/mount_zfs.c @@ -269,8 +269,7 @@ main(int argc, char **argv) return (MOUNT_USAGE); } - if (!zfsutil || sloppy || - libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { + if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt); } @@ -337,7 +336,7 @@ main(int argc, char **argv) dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt); if (!fake) { - if (zfsutil && !sloppy && + if (!remount && !sloppy && !libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint); if (error) { diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 54ed70d0394f..e042116333fb 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) zfsvfs_t *snap_zfsvfs; zfs_snapentry_t *se; char *full_name, *full_path; - char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, - NULL }; + char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n", + NULL, NULL, NULL }; char *envp[] = { NULL }; int error; struct path spath; @@ -1153,8 +1153,8 @@ 
zfsctl_snapshot_mount(struct path *path, int flags) * value from call_usermodehelper() will be (exitcode << 8 + signal). */ dprintf("mount; name=%s path=%s\n", full_name, full_path); - argv[5] = full_name; - argv[6] = full_path; + argv[6] = full_name; + argv[7] = full_path; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) {