From 98594181944daa201481ad63242806beb7c89ff4 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 16 Nov 2023 16:52:15 +0000 Subject: [PATCH 1/7] iommufd/selftest: Fix _test_mock_dirty_bitmaps() The ASSERT_EQ() macro sneakily expands to two statements, so the loop here needs braces to ensure it captures both and actually terminates the test upon failure. Where these tests are currently failing on my arm64 machine, this reduces the number of logged lines from a rather unreasonable ~197,000 down to 10. While we're at it, we can also clean up the tautologous "count" calculations whose assertions can never fail unless mathematics and/or the C language become fundamentally broken. Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Link: https://lore.kernel.org/r/90e083045243ef407dd592bb1deec89cd1f4ddf2.1700153535.git.robin.murphy@arm.com Signed-off-by: Robin Murphy Reviewed-by: Kevin Tian Reviewed-by: Joao Martins Tested-by: Joao Martins Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd_utils.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 050e9751321cf..ad9202335656c 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -293,15 +293,13 @@ static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, __u64 bitmap_size, __u32 flags, struct __test_metadata *_metadata) { - unsigned long i, count, nbits = bitmap_size * BITS_PER_BYTE; + unsigned long i, nbits = bitmap_size * BITS_PER_BYTE; unsigned long nr = nbits / 2; __u64 out_dirty = 0; /* Mark all even bits as dirty in the mock domain */ - for (count = 0, i = 0; i < nbits; count += !(i % 2), i++) - if (!(i % 2)) - set_bit(i, (unsigned long *)bitmap); - ASSERT_EQ(nr, count); + for (i = 0; i < nbits; i += 2) + set_bit(i, (unsigned long *)bitmap); test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size, bitmap, &out_dirty); @@ -311,9 +309,10 @@ static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, memset(bitmap, 0, bitmap_size); test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap, flags); - for (count = 0, i = 0; i < nbits; count += !(i % 2), i++) + /* Beware ASSERT_EQ() is two statements -- braces are not redundant! */ + for (i = 0; i < nbits; i++) { ASSERT_EQ(!(i % 2), test_bit(i, (unsigned long *)bitmap)); - ASSERT_EQ(count, out_dirty); + } memset(bitmap, 0, bitmap_size); test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap, From bd7a282650b8beb57bc9d19bfcb714b1ccae843a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 12 Nov 2023 14:50:13 -0400 Subject: [PATCH 2/7] iommufd: Add iommufd_ctx to iommufd_put_object() Will be used in the next patch. Link: https://lore.kernel.org/r/1-v2-ca9e00171c5b+123-iommufd_syz4_jgg@nvidia.com/ Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 14 +++++++------- drivers/iommu/iommufd/hw_pagetable.c | 8 ++++---- drivers/iommu/iommufd/ioas.c | 14 +++++++------- drivers/iommu/iommufd/iommufd_private.h | 3 ++- drivers/iommu/iommufd/selftest.c | 14 +++++++------- drivers/iommu/iommufd/vfio_compat.c | 18 +++++++++--------- 6 files changed, 36 insertions(+), 35 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 59d3a07300d93..873630c111c1f 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -571,7 +571,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, continue; destroy_hwpt = (*do_attach)(idev, hwpt); if (IS_ERR(destroy_hwpt)) { - iommufd_put_object(&hwpt->obj); + iommufd_put_object(idev->ictx, &hwpt->obj); /* * -EINVAL means the domain is incompatible with the * device. Other error codes should propagate to @@ -583,7 +583,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, goto out_unlock; } *pt_id = hwpt->obj.id; - iommufd_put_object(&hwpt->obj); + iommufd_put_object(idev->ictx, &hwpt->obj); goto out_unlock; } @@ -652,7 +652,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, destroy_hwpt = ERR_PTR(-EINVAL); goto out_put_pt_obj; } - iommufd_put_object(pt_obj); + iommufd_put_object(idev->ictx, pt_obj); /* This destruction has to be after we unlock everything */ if (destroy_hwpt) @@ -660,7 +660,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, return 0; out_put_pt_obj: - iommufd_put_object(pt_obj); + iommufd_put_object(idev->ictx, pt_obj); return PTR_ERR(destroy_hwpt); } @@ -792,7 +792,7 @@ static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id) if (IS_ERR(ioas)) return PTR_ERR(ioas); rc = iommufd_access_change_ioas(access, ioas); - iommufd_put_object(&ioas->obj); + iommufd_put_object(access->ictx, &ioas->obj); return rc; } @@ -941,7 +941,7 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, access->ops->unmap(access->data, iova, length); - iommufd_put_object(&access->obj); + iommufd_put_object(access->ictx, &access->obj); xa_lock(&ioas->iopt.access_list); } xa_unlock(&ioas->iopt.access_list); @@ -1243,6 +1243,6 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) out_free: kfree(data); out_put: - iommufd_put_object(&idev->obj); + iommufd_put_object(ucmd->ictx, &idev->obj); return rc; } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 2abbeafdbd22d..cbb5df0a6c32f 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -318,9 +318,9 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) if (ioas) mutex_unlock(&ioas->mutex); out_put_pt: - iommufd_put_object(pt_obj); + iommufd_put_object(ucmd->ictx, pt_obj); out_put_idev: - iommufd_put_object(&idev->obj); + iommufd_put_object(ucmd->ictx, &idev->obj); return rc; } @@ -345,7 +345,7 @@ int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain, enable); - iommufd_put_object(&hwpt_paging->common.obj); + iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); return rc; } @@ -368,6 +368,6 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) rc = iopt_read_and_clear_dirty_data( &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd); - iommufd_put_object(&hwpt_paging->common.obj); + iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); return rc; } diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index d5624577f79f1..7422482765481 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -105,7 +105,7 @@ int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd) rc = -EMSGSIZE; out_put: up_read(&ioas->iopt.iova_rwsem); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -175,7 +175,7 @@ int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd) interval_tree_remove(node, &allowed_iova); kfree(container_of(node, struct iopt_allowed, node)); } - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -228,7 +228,7 @@ int iommufd_ioas_map(struct iommufd_ucmd *ucmd) cmd->iova = iova; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put: - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -258,7 +258,7 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) return PTR_ERR(src_ioas); rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length, &pages_list); - iommufd_put_object(&src_ioas->obj); + iommufd_put_object(ucmd->ictx, &src_ioas->obj); if (rc) return rc; @@ -279,7 +279,7 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) cmd->dst_iova = iova; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put_dst: - iommufd_put_object(&dst_ioas->obj); + iommufd_put_object(ucmd->ictx, &dst_ioas->obj); out_pages: iopt_free_pages_list(&pages_list); return rc; @@ -315,7 +315,7 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put: - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -393,6 +393,6 @@ int iommufd_ioas_option(struct iommufd_ucmd *ucmd) rc = -EOPNOTSUPP; } - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index a74cfefffbc6c..f918a22f0d480 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -154,7 +154,8 @@ static inline bool iommufd_lock_obj(struct iommufd_object *obj) struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, enum iommufd_object_type type); -static inline void iommufd_put_object(struct iommufd_object *obj) +static inline void iommufd_put_object(struct iommufd_ctx *ictx, + struct iommufd_object *obj) { refcount_dec(&obj->users); up_read(&obj->destroy_rwsem); diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 5d93434003d8a..022ef8f55088a 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -86,7 +86,7 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, if (IS_ERR(ioas)) return; *iova = iommufd_test_syz_conv_iova(&ioas->iopt, iova); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); } struct mock_iommu_domain { @@ -500,7 +500,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, return hwpt; if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || hwpt->domain->ops != mock_ops.default_domain_ops) { - iommufd_put_object(&hwpt->obj); + iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain); @@ -518,7 +518,7 @@ get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, return hwpt; if (hwpt->domain->type != IOMMU_DOMAIN_NESTED || hwpt->domain->ops != &domain_nested_ops) { - iommufd_put_object(&hwpt->obj); + iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } *mock_nested = container_of(hwpt->domain, @@ -681,7 +681,7 @@ static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_dev_obj: - iommufd_put_object(dev_obj); + iommufd_put_object(ucmd->ictx, dev_obj); return rc; } @@ -699,7 +699,7 @@ static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd, down_write(&ioas->iopt.iova_rwsem); rc = iopt_reserve_iova(&ioas->iopt, start, start + length - 1, NULL); up_write(&ioas->iopt.iova_rwsem); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return rc; } @@ -754,7 +754,7 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, rc = 0; out_put: - iommufd_put_object(&hwpt->obj); + iommufd_put_object(ucmd->ictx, &hwpt->obj); return rc; } @@ -1233,7 +1233,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, out_free: kvfree(tmp); out_put: - iommufd_put_object(&hwpt->obj); + iommufd_put_object(ucmd->ictx, &hwpt->obj); return rc; } diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index 538fbf76354d1..a3ad5f0b6c59d 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -41,7 +41,7 @@ int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id) if (IS_ERR(ioas)) return PTR_ERR(ioas); *out_ioas_id = ioas->obj.id; - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return 0; } EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO); @@ -98,7 +98,7 @@ int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx) if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) { ret = 0; - iommufd_put_object(&ictx->vfio_ioas->obj); + iommufd_put_object(ictx, &ictx->vfio_ioas->obj); goto out_abort; } ictx->vfio_ioas = ioas; @@ -133,7 +133,7 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) if (IS_ERR(ioas)) return PTR_ERR(ioas); cmd->ioas_id = ioas->obj.id; - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); case IOMMU_VFIO_IOAS_SET: @@ -143,7 +143,7 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) xa_lock(&ucmd->ictx->objects); ucmd->ictx->vfio_ioas = ioas; xa_unlock(&ucmd->ictx->objects); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ucmd->ictx, &ioas->obj); return 0; case IOMMU_VFIO_IOAS_CLEAR: @@ -190,7 +190,7 @@ static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd, iova = map.iova; rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr), map.size, iommu_prot, 0); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } @@ -249,7 +249,7 @@ static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd, rc = -EFAULT; err_put: - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } @@ -272,7 +272,7 @@ static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) } mutex_unlock(&ioas->mutex); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } @@ -349,7 +349,7 @@ static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type) */ if (type == VFIO_TYPE1_IOMMU) rc = iopt_disable_large_pages(&ioas->iopt); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } @@ -511,7 +511,7 @@ static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx, out_put: up_read(&ioas->iopt.iova_rwsem); - iommufd_put_object(&ioas->obj); + iommufd_put_object(ictx, &ioas->obj); return rc; } From 6f9c4d8c468c189d6dc470324bd52955f8aa0a10 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 12 Nov 2023 15:44:08 -0400 Subject: [PATCH 3/7] iommufd: Do not UAF during iommufd_put_object() The mixture of kernel and user space lifecycle objects continues to be complicated inside iommufd. The obj->destroy_rwsem is used to bring order to the kernel driver destruction sequence but it cannot be sequenced right with the other refcounts so we end up possibly UAF'ing: BUG: KASAN: slab-use-after-free in __up_read+0x627/0x750 kernel/locking/rwsem.c:1342 Read of size 8 at addr ffff888073cde868 by task syz-executor934/6535 CPU: 1 PID: 6535 Comm: syz-executor934 Not tainted 6.6.0-rc7-syzkaller-00195-g2af9b20dbb39 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x1b0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0xc4/0x620 mm/kasan/report.c:475 kasan_report+0xda/0x110 mm/kasan/report.c:588 __up_read+0x627/0x750 kernel/locking/rwsem.c:1342 iommufd_put_object drivers/iommu/iommufd/iommufd_private.h:149 [inline] iommufd_vfio_ioas+0x46c/0x580 drivers/iommu/iommufd/vfio_compat.c:146 iommufd_fops_ioctl+0x347/0x4d0 drivers/iommu/iommufd/main.c:398 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl fs/ioctl.c:857 [inline] __x64_sys_ioctl+0x18f/0x210 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd There are two races here, the more obvious one: CPU 0 CPU 1 iommufd_put_object() iommufd_destroy() refcount_dec(&obj->users) iommufd_object_remove() kfree() up_read(&obj->destroy_rwsem) // Boom And there is also perhaps some possibility that the rwsem could hit an issue: CPU 0 CPU 1 iommufd_put_object() iommufd_object_destroy_user() refcount_dec(&obj->users); down_write(&obj->destroy_rwsem) up_read(&obj->destroy_rwsem); atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); tmp = atomic_long_add_return_release() rwsem_try_write_lock() iommufd_object_remove() up_write(&obj->destroy_rwsem) kfree() clear_nonspinnable() // Boom Fix this by reorganizing this again so that two refcounts are used to keep track of things with a rule that users == 0 && shortterm_users == 0 means no other threads have that memory. Put a wait_queue in the iommufd_ctx object that is triggered when any sub object reaches a 0 shortterm_users. This allows the same wait for userspace ioctls to finish behavior that the rwsem was providing. This is weaker still than the prior versions: - There is no bias on shortterm_users so if some thread is waiting to destroy other threads can continue to get new read sides - If destruction fails, eg because of an active in-kernel user, then shortterm_users will have cycled to zero momentarily blocking new users - If userspace races destroy with other userspace operations they continue to get an EBUSY since we still can't intermix looking up an ID and sleeping for its unref In all cases these are things that userspace brings on itself, correct programs will not hit them. Fixes: 99f98a7c0d69 ("iommufd: IOMMUFD_DESTROY should not increase the refcount") Link: https://lore.kernel.org/all/2-v2-ca9e00171c5b+123-iommufd_syz4_jgg@nvidia.com/ Reported-by: syzbot+d31adfb277377ef8fcba@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/00000000000055ef9a0609336580@google.com Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_private.h | 67 +++++++++-- drivers/iommu/iommufd/main.c | 146 +++++++++++++----------- 2 files changed, 135 insertions(+), 78 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index f918a22f0d480..abae041e256f7 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -21,6 +21,7 @@ struct iommufd_ctx { struct file *file; struct xarray objects; struct xarray groups; + wait_queue_head_t destroy_wait; u8 account_mode; /* Compatibility with VFIO no iommu */ @@ -135,7 +136,7 @@ enum iommufd_object_type { /* Base struct for all objects with a userspace ID handle. */ struct iommufd_object { - struct rw_semaphore destroy_rwsem; + refcount_t shortterm_users; refcount_t users; enum iommufd_object_type type; unsigned int id; @@ -143,10 +144,15 @@ struct iommufd_object { static inline bool iommufd_lock_obj(struct iommufd_object *obj) { - if (!down_read_trylock(&obj->destroy_rwsem)) + if (!refcount_inc_not_zero(&obj->users)) return false; - if (!refcount_inc_not_zero(&obj->users)) { - up_read(&obj->destroy_rwsem); + if (!refcount_inc_not_zero(&obj->shortterm_users)) { + /* + * If the caller doesn't already have a ref on obj this must be + * called under the xa_lock. Otherwise the caller is holding a + * ref on users. Thus it cannot be one before this decrement. + */ + refcount_dec(&obj->users); return false; } return true; @@ -157,8 +163,13 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, static inline void iommufd_put_object(struct iommufd_ctx *ictx, struct iommufd_object *obj) { + /* + * Users first, then shortterm so that REMOVE_WAIT_SHORTTERM never sees + * a spurious !0 users with a 0 shortterm_users. + */ refcount_dec(&obj->users); - up_read(&obj->destroy_rwsem); + if (refcount_dec_and_test(&obj->shortterm_users)) + wake_up_interruptible_all(&ictx->destroy_wait); } void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); @@ -166,17 +177,49 @@ void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj); void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj); -void __iommufd_object_destroy_user(struct iommufd_ctx *ictx, - struct iommufd_object *obj, bool allow_fail); + +enum { + REMOVE_WAIT_SHORTTERM = 1, +}; +int iommufd_object_remove(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy, u32 id, + unsigned int flags); + +/* + * The caller holds a users refcount and wants to destroy the object. At this + * point the caller has no shortterm_users reference and at least the xarray + * will be holding one. + */ static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx, struct iommufd_object *obj) { - __iommufd_object_destroy_user(ictx, obj, false); + int ret; + + ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT_SHORTTERM); + + /* + * If there is a bug and we couldn't destroy the object then we did put + * back the caller's users refcount and will eventually try to free it + * again during close. + */ + WARN_ON(ret); } -static inline void iommufd_object_deref_user(struct iommufd_ctx *ictx, - struct iommufd_object *obj) + +/* + * The HWPT allocated by autodomains is used in possibly many devices and + * is automatically destroyed when its refcount reaches zero. + * + * If userspace uses the HWPT manually, even for a short term, then it will + * disrupt this refcounting and the auto-free in the kernel will not work. + * Userspace that tries to use the automatically allocated HWPT must be careful + * to ensure that it is consistently destroyed, eg by not racing accesses + * and by not attaching an automatic HWPT to a device manually. + */ +static inline void +iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, + struct iommufd_object *obj) { - __iommufd_object_destroy_user(ictx, obj, true); + iommufd_object_remove(ictx, obj, obj->id, 0); } struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, @@ -312,7 +355,7 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, lockdep_assert_not_held(&hwpt_paging->ioas->mutex); if (hwpt_paging->auto_domain) { - iommufd_object_deref_user(ictx, &hwpt->obj); + iommufd_object_put_and_try_destroy(ictx, &hwpt->obj); return; } } diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 45b9d40773b13..c9091e46d208a 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -33,7 +33,6 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type) { - static struct lock_class_key obj_keys[IOMMUFD_OBJ_MAX]; struct iommufd_object *obj; int rc; @@ -41,15 +40,8 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, if (!obj) return ERR_PTR(-ENOMEM); obj->type = type; - /* - * In most cases the destroy_rwsem is obtained with try so it doesn't - * interact with lockdep, however on destroy we have to sleep. This - * means if we have to destroy an object while holding a get on another - * object it triggers lockdep. Using one locking class per object type - * is a simple and reasonable way to avoid this. - */ - __init_rwsem(&obj->destroy_rwsem, "iommufd_object::destroy_rwsem", - &obj_keys[type]); + /* Starts out bias'd by 1 until it is removed from the xarray */ + refcount_set(&obj->shortterm_users, 1); refcount_set(&obj->users, 1); /* @@ -129,92 +121,113 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, return obj; } +static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy) +{ + if (refcount_dec_and_test(&to_destroy->shortterm_users)) + return 0; + + if (wait_event_timeout(ictx->destroy_wait, + refcount_read(&to_destroy->shortterm_users) == + 0, + msecs_to_jiffies(10000))) + return 0; + + pr_crit("Time out waiting for iommufd object to become free\n"); + refcount_inc(&to_destroy->shortterm_users); + return -EBUSY; +} + /* * Remove the given object id from the xarray if the only reference to the - * object is held by the xarray. The caller must call ops destroy(). + * object is held by the xarray. */ -static struct iommufd_object *iommufd_object_remove(struct iommufd_ctx *ictx, - u32 id, bool extra_put) +int iommufd_object_remove(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy, u32 id, + unsigned int flags) { struct iommufd_object *obj; XA_STATE(xas, &ictx->objects, id); - - xa_lock(&ictx->objects); - obj = xas_load(&xas); - if (xa_is_zero(obj) || !obj) { - obj = ERR_PTR(-ENOENT); - goto out_xa; - } + bool zerod_shortterm = false; + int ret; /* - * If the caller is holding a ref on obj we put it here under the - * spinlock. + * The purpose of the shortterm_users is to ensure deterministic + * destruction of objects used by external drivers and destroyed by this + * function. Any temporary increment of the refcount must increment + * shortterm_users, such as during ioctl execution. */ - if (extra_put) + if (flags & REMOVE_WAIT_SHORTTERM) { + ret = iommufd_object_dec_wait_shortterm(ictx, to_destroy); + if (ret) { + /* + * We have a bug. Put back the callers reference and + * defer cleaning this object until close. + */ + refcount_dec(&to_destroy->users); + return ret; + } + zerod_shortterm = true; + } + + xa_lock(&ictx->objects); + obj = xas_load(&xas); + if (to_destroy) { + /* + * If the caller is holding a ref on obj we put it here under + * the spinlock. + */ refcount_dec(&obj->users); + if (WARN_ON(obj != to_destroy)) { + ret = -ENOENT; + goto err_xa; + } + } else if (xa_is_zero(obj) || !obj) { + ret = -ENOENT; + goto err_xa; + } + if (!refcount_dec_if_one(&obj->users)) { - obj = ERR_PTR(-EBUSY); - goto out_xa; + ret = -EBUSY; + goto err_xa; } xas_store(&xas, NULL); if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj)) ictx->vfio_ioas = NULL; - -out_xa: xa_unlock(&ictx->objects); - /* The returned object reference count is zero */ - return obj; -} - -/* - * The caller holds a users refcount and wants to destroy the object. Returns - * true if the object was destroyed. In all cases the caller no longer has a - * reference on obj. - */ -void __iommufd_object_destroy_user(struct iommufd_ctx *ictx, - struct iommufd_object *obj, bool allow_fail) -{ - struct iommufd_object *ret; - /* - * The purpose of the destroy_rwsem is to ensure deterministic - * destruction of objects used by external drivers and destroyed by this - * function. Any temporary increment of the refcount must hold the read - * side of this, such as during ioctl execution. - */ - down_write(&obj->destroy_rwsem); - ret = iommufd_object_remove(ictx, obj->id, true); - up_write(&obj->destroy_rwsem); - - if (allow_fail && IS_ERR(ret)) - return; - - /* - * If there is a bug and we couldn't destroy the object then we did put - * back the caller's refcount and will eventually try to free it again - * during close. + * Since users is zero any positive users_shortterm must be racing + * iommufd_put_object(), or we have a bug. */ - if (WARN_ON(IS_ERR(ret))) - return; + if (!zerod_shortterm) { + ret = iommufd_object_dec_wait_shortterm(ictx, obj); + if (WARN_ON(ret)) + return ret; + } iommufd_object_ops[obj->type].destroy(obj); kfree(obj); + return 0; + +err_xa: + if (zerod_shortterm) { + /* Restore the xarray owned reference */ + refcount_set(&obj->shortterm_users, 1); + } + xa_unlock(&ictx->objects); + + /* The returned object reference count is zero */ + return ret; } static int iommufd_destroy(struct iommufd_ucmd *ucmd) { struct iommu_destroy *cmd = ucmd->cmd; - struct iommufd_object *obj; - obj = iommufd_object_remove(ucmd->ictx, cmd->id, false); - if (IS_ERR(obj)) - return PTR_ERR(obj); - iommufd_object_ops[obj->type].destroy(obj); - kfree(obj); - return 0; + return iommufd_object_remove(ucmd->ictx, NULL, cmd->id, 0); } static int iommufd_fops_open(struct inode *inode, struct file *filp) @@ -238,6 +251,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); ictx->file = filp; + init_waitqueue_head(&ictx->destroy_wait); filp->private_data = ictx; return 0; } From 480b3e73720f6b5d76bef2387b1f9d19ed67573b Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 3 Nov 2023 05:26:27 -0700 Subject: [PATCH 4/7] vdpa/mlx5: preserve CVQ vringh index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mlx5_vdpa does not preserve userland's view of vring base for the control queue in the following sequence: ioctl VHOST_SET_VRING_BASE ioctl VHOST_VDPA_SET_STATUS VIRTIO_CONFIG_S_DRIVER_OK mlx5_vdpa_set_status() setup_cvq_vring() vringh_init_iotlb() vringh_init_kern() vrh->last_avail_idx = 0; ioctl VHOST_GET_VRING_BASE To fix, restore the value of cvq->vring.last_avail_idx after calling vringh_init_iotlb. Fixes: 5262912ef3cf ("vdpa/mlx5: Add support for control VQ and MAC setting") Signed-off-by: Steve Sistare Acked-by: Eugenio PĂ©rez Acked-by: Jason Wang Message-Id: <1699014387-194368-1-git-send-email-steven.sistare@oracle.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 12ac3397f39b8..26ba7da6b4106 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2815,13 +2815,18 @@ static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev) struct mlx5_control_vq *cvq = &mvdev->cvq; int err = 0; - if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) + if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) { + u16 idx = cvq->vring.last_avail_idx; + err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features, MLX5_CVQ_MAX_ENT, false, (struct vring_desc *)(uintptr_t)cvq->desc_addr, (struct vring_avail *)(uintptr_t)cvq->driver_addr, (struct vring_used *)(uintptr_t)cvq->device_addr); + if (!err) + cvq->vring.last_avail_idx = cvq->vring.last_used_idx = idx; + } return err; } From 4f317d6529d7fc3ab7769ef89645d43fc7eec61b Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 10 Nov 2023 14:18:00 -0800 Subject: [PATCH 5/7] pds_vdpa: fix up format-truncation complaint Our friendly kernel test robot has recently been pointing out some format-truncation issues. Here's a fix for one of them. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311040109.RfgJoE7L-lkp@intel.com/ Signed-off-by: Shannon Nelson Message-Id: <20231110221802.46841-2-shannon.nelson@amd.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/pds/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/pds/debugfs.c b/drivers/vdpa/pds/debugfs.c index 9b04aad6ec35d..c328e694f6e7f 100644 --- a/drivers/vdpa/pds/debugfs.c +++ b/drivers/vdpa/pds/debugfs.c @@ -261,7 +261,7 @@ void pds_vdpa_debugfs_add_vdpadev(struct pds_vdpa_aux *vdpa_aux) debugfs_create_file("config", 0400, vdpa_aux->dentry, vdpa_aux->pdsv, &config_fops); for (i = 0; i < vdpa_aux->pdsv->num_vqs; i++) { - char name[8]; + char name[16]; snprintf(name, sizeof(name), "vq%02d", i); debugfs_create_file(name, 0400, vdpa_aux->dentry, From dd3b8de16e90c5594eddd29aeeb99e97c6f863be Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 10 Nov 2023 14:18:01 -0800 Subject: [PATCH 6/7] pds_vdpa: clear config callback when status goes to 0 If the client driver is setting status to 0, something is getting shutdown and possibly removed. Make sure we clear the config_cb so that it doesn't end up crashing when trying to call a bogus callback. Signed-off-by: Shannon Nelson Message-Id: <20231110221802.46841-3-shannon.nelson@amd.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/pds/vdpa_dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c index 52b2449182ad7..9fc89c82d1f01 100644 --- a/drivers/vdpa/pds/vdpa_dev.c +++ b/drivers/vdpa/pds/vdpa_dev.c @@ -461,8 +461,10 @@ static void pds_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) pds_vdpa_cmd_set_status(pdsv, status); - /* Note: still working with FW on the need for this reset cmd */ if (status == 0) { + struct vdpa_callback null_cb = { }; + + pds_vdpa_set_config_cb(vdpa_dev, &null_cb); pds_vdpa_cmd_reset(pdsv); for (i = 0; i < pdsv->num_vqs; i++) { From cefc9ba6aed48a3aa085888e3262ac2aa975714b Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 10 Nov 2023 14:18:02 -0800 Subject: [PATCH 7/7] pds_vdpa: set features order Fix up the order that the device and negotiated features are checked to get a more reliable difference when things get changed. Signed-off-by: Shannon Nelson Message-Id: <20231110221802.46841-4-shannon.nelson@amd.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/pds/vdpa_dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c index 9fc89c82d1f01..25c0fe5ec3d5d 100644 --- a/drivers/vdpa/pds/vdpa_dev.c +++ b/drivers/vdpa/pds/vdpa_dev.c @@ -318,9 +318,8 @@ static int pds_vdpa_set_driver_features(struct vdpa_device *vdpa_dev, u64 featur return -EOPNOTSUPP; } - pdsv->negotiated_features = nego_features; - driver_features = pds_vdpa_get_driver_features(vdpa_dev); + pdsv->negotiated_features = nego_features; dev_dbg(dev, "%s: %#llx => %#llx\n", __func__, driver_features, nego_features);