From 1f0bbf28940cf5edad90ab57b62aa8197bf5e836 Mon Sep 17 00:00:00 2001 From: Varun Prakash Date: Wed, 9 Aug 2023 15:56:45 +0530 Subject: [PATCH 001/161] nvmet-tcp: pass iov_len instead of sg->length to bvec_set_page() iov_len is the valid data length, so pass iov_len instead of sg->length to bvec_set_page(). Fixes: 5bfaba275ae6 ("nvmet-tcp: don't map pages which can't come from HIGHMEM") Signed-off-by: Rakshana Sridhar Signed-off-by: Varun Prakash Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 868aa4de2e4c4..cd92d7ddf5ed1 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -348,7 +348,7 @@ static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) while (length) { u32 iov_len = min_t(u32, length, sg->length - sg_offset); - bvec_set_page(iov, sg_page(sg), sg->length, + bvec_set_page(iov, sg_page(sg), iov_len, sg->offset + sg_offset); length -= iov_len; From 71be868472dc5beb82feb4da2d3eb9cba785d660 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 20 Aug 2023 11:21:39 +0200 Subject: [PATCH 002/161] nvme: host: hwmon: constify pointers to hwmon_channel_info Statically allocated array of pointed to hwmon_channel_info can be made const for safety. Signed-off-by: Krzysztof Kozlowski Acked-by: Christoph Hellwig Acked-by: Guenter Roeck Signed-off-by: Keith Busch --- drivers/nvme/host/hwmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 316f3e4ca7cc6..8df73a0b3980c 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -187,7 +187,7 @@ static umode_t nvme_hwmon_is_visible(const void *_data, return 0; } -static const struct hwmon_channel_info *nvme_hwmon_info[] = { +static const struct hwmon_channel_info *const nvme_hwmon_info[] = { HWMON_CHANNEL_INFO(chip, HWMON_C_REGISTER_TZ), HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN | From 8ae5b3a685dc59a8cf7ccfe0e850999ba9727a3c Mon Sep 17 00:00:00 2001 From: Nigel Kirkland Date: Thu, 17 Aug 2023 12:43:01 -0700 Subject: [PATCH 003/161] nvme-fc: Prevent null pointer dereference in nvme_fc_io_getuuid() The nvme_fc_fcp_op structure describing an AEN operation is initialized with a null request structure pointer. An FC LLDD may make a call to nvme_fc_io_getuuid passing a pointer to an nvmefc_fcp_req for an AEN operation. Add validation of the request structure pointer before dereference. Signed-off-by: Nigel Kirkland Reviewed-by: James Smart Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 1cd2bf82319a9..a15b37750d6e9 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1924,7 +1924,7 @@ char *nvme_fc_io_getuuid(struct nvmefc_fcp_req *req) struct nvme_fc_fcp_op *op = fcp_req_to_fcp_op(req); struct request *rq = op->rq; - if (!IS_ENABLED(CONFIG_BLK_CGROUP_FC_APPID) || !rq->bio) + if (!IS_ENABLED(CONFIG_BLK_CGROUP_FC_APPID) || !rq || !rq->bio) return NULL; return blkcg_get_fc_appid(rq->bio); } From 0c3b063ef4136191312a88ea7a670a6a2a2dae5a Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 24 Aug 2023 08:37:01 +0100 Subject: [PATCH 004/161] drm/drm_connector: Provide short description of param 'supported_colorspaces' Fixes the following W=1 kernel build warning(s): drivers/gpu/drm/drm_connector.c:2215: warning: Function parameter or member 'supported_colorspaces' not described in 'drm_mode_create_hdmi_colorspace_property' drivers/gpu/drm/drm_connector.c:2239: warning: Function parameter or member 'supported_colorspaces' not described in 'drm_mode_create_dp_colorspace_property' Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20230824073710.2677348-17-lee@kernel.org Signed-off-by: Maxime Ripard --- drivers/gpu/drm/drm_connector.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index 3ed4cfcb350c4..f287257362372 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -2203,6 +2203,7 @@ static int drm_mode_create_colorspace_property(struct drm_connector *connector, /** * drm_mode_create_hdmi_colorspace_property - create hdmi colorspace property * @connector: connector to create the Colorspace property on. + * @supported_colorspaces: bitmap of supported color spaces * * Called by a driver the first time it's needed, must be attached to desired * HDMI connectors. @@ -2227,6 +2228,7 @@ EXPORT_SYMBOL(drm_mode_create_hdmi_colorspace_property); /** * drm_mode_create_dp_colorspace_property - create dp colorspace property * @connector: connector to create the Colorspace property on. + * @supported_colorspaces: bitmap of supported color spaces * * Called by a driver the first time it's needed, must be attached to desired * DP connectors. From 6428bc7bd3f35e43c8cb7359cb89d83248d339d2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 30 Aug 2023 07:56:04 +0200 Subject: [PATCH 005/161] parisc: sba_iommu: Fix build warning if procfs if disabled Clean up the code, e.g. make proc_mckinley_root static, drop the now empty mckinley header file and remove some unneeded ifdefs around procfs functions. Signed-off-by: Helge Deller Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308300800.Jod4sHzM-lkp@intel.com/ Fixes: 77e0ddf097d6 ("parisc: ccio-dma: Create private runway procfs root entry") --- arch/parisc/include/asm/mckinley.h | 8 -------- drivers/parisc/sba_iommu.c | 10 ++-------- 2 files changed, 2 insertions(+), 16 deletions(-) delete mode 100644 arch/parisc/include/asm/mckinley.h diff --git a/arch/parisc/include/asm/mckinley.h b/arch/parisc/include/asm/mckinley.h deleted file mode 100644 index 1314390b9034b..0000000000000 --- a/arch/parisc/include/asm/mckinley.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_PARISC_MCKINLEY_H -#define ASM_PARISC_MCKINLEY_H - -/* declared in arch/parisc/kernel/setup.c */ -extern struct proc_dir_entry * proc_mckinley_root; - -#endif /*ASM_PARISC_MCKINLEY_H*/ diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index f6b5106753189..ee5a2f4b74745 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -46,8 +46,6 @@ #include #include -#include /* for proc_mckinley_root */ -#include /* for proc_runway_root */ #include /* for PAGE0 */ #include /* for PDC_MODEL_* */ #include /* for is_pdc_pat() */ @@ -122,7 +120,7 @@ MODULE_PARM_DESC(sba_reserve_agpgart, "Reserve half of IO pdir as AGPGART"); #endif static struct proc_dir_entry *proc_runway_root __ro_after_init; -struct proc_dir_entry *proc_mckinley_root __ro_after_init; +static struct proc_dir_entry *proc_mckinley_root __ro_after_init; /************************************ ** SBA register read and write support @@ -1899,9 +1897,7 @@ static int __init sba_driver_callback(struct parisc_device *dev) int i; char *version; void __iomem *sba_addr = ioremap(dev->hpa.start, SBA_FUNC_SIZE); -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *root; -#endif + struct proc_dir_entry *root __maybe_unused; sba_dump_ranges(sba_addr); @@ -1967,7 +1963,6 @@ static int __init sba_driver_callback(struct parisc_device *dev) hppa_dma_ops = &sba_ops; -#ifdef CONFIG_PROC_FS switch (dev->id.hversion) { case PLUTO_MCKINLEY_PORT: if (!proc_mckinley_root) @@ -1985,7 +1980,6 @@ static int __init sba_driver_callback(struct parisc_device *dev) proc_create_single("sba_iommu", 0, root, sba_proc_info); proc_create_single("sba_iommu-bitmap", 0, root, sba_proc_bitmap_info); -#endif return 0; } From eb3255ee8f6f4691471a28fbf22db5e8901116cd Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 30 Aug 2023 08:10:01 +0200 Subject: [PATCH 006/161] parisc: sba: Fix compile warning wrt list of SBA devices Fix this makecheck warning: drivers/parisc/sba_iommu.c:98:19: warning: symbol 'sba_list' was not declared. Should it be static? Signed-off-by: Helge Deller --- arch/parisc/include/asm/ropes.h | 3 +++ drivers/char/agp/parisc-agp.c | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/parisc/include/asm/ropes.h b/arch/parisc/include/asm/ropes.h index fd96706c7234a..d7b941cfbccd8 100644 --- a/arch/parisc/include/asm/ropes.h +++ b/arch/parisc/include/asm/ropes.h @@ -86,6 +86,9 @@ struct sba_device { struct ioc ioc[MAX_IOC]; }; +/* list of SBA's in system, see drivers/parisc/sba_iommu.c */ +extern struct sba_device *sba_list; + #define ASTRO_RUNWAY_PORT 0x582 #define IKE_MERCED_PORT 0x803 #define REO_MERCED_PORT 0x804 diff --git a/drivers/char/agp/parisc-agp.c b/drivers/char/agp/parisc-agp.c index 514f9f287a781..c6f181702b9a7 100644 --- a/drivers/char/agp/parisc-agp.c +++ b/drivers/char/agp/parisc-agp.c @@ -394,8 +394,6 @@ find_quicksilver(struct device *dev, void *data) static int __init parisc_agp_init(void) { - extern struct sba_device *sba_list; - int err = -1; struct parisc_device *sba = NULL, *lba = NULL; struct lba_device *lbadev = NULL; From c1ebb94071cb4455177bafa619423acb3494d15d Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 30 Aug 2023 11:49:57 +0200 Subject: [PATCH 007/161] parisc: sba-iommu: Fix sparse warnigs Fix sparse warnings, as pdir is __le64 *. Signed-off-by: Helge Deller --- arch/parisc/include/asm/ropes.h | 4 ++-- drivers/parisc/iommu-helpers.h | 4 ++-- drivers/parisc/sba_iommu.c | 28 ++++++++++++++-------------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/parisc/include/asm/ropes.h b/arch/parisc/include/asm/ropes.h index d7b941cfbccd8..e2d2d7e9bfdea 100644 --- a/arch/parisc/include/asm/ropes.h +++ b/arch/parisc/include/asm/ropes.h @@ -29,7 +29,7 @@ struct ioc { void __iomem *ioc_hpa; /* I/O MMU base address */ char *res_map; /* resource map, bit == pdir entry */ - u64 *pdir_base; /* physical base address */ + __le64 *pdir_base; /* physical base address */ unsigned long ibase; /* pdir IOV Space base - shared w/lba_pci */ unsigned long imask; /* pdir IOV Space mask - shared w/lba_pci */ #ifdef ZX1_SUPPORT @@ -113,7 +113,7 @@ static inline int IS_PLUTO(struct parisc_device *d) { #define SBA_PDIR_VALID_BIT 0x8000000000000000ULL -#define SBA_AGPGART_COOKIE 0x0000badbadc0ffeeULL +#define SBA_AGPGART_COOKIE (__force __le64) 0x0000badbadc0ffeeULL #define SBA_FUNC_ID 0x0000 /* function id */ #define SBA_FCLASS 0x0008 /* function class, bist, header, rev... */ diff --git a/drivers/parisc/iommu-helpers.h b/drivers/parisc/iommu-helpers.h index 0905be256de08..a00c38b6224ab 100644 --- a/drivers/parisc/iommu-helpers.h +++ b/drivers/parisc/iommu-helpers.h @@ -14,13 +14,13 @@ static inline unsigned int iommu_fill_pdir(struct ioc *ioc, struct scatterlist *startsg, int nents, unsigned long hint, - void (*iommu_io_pdir_entry)(u64 *, space_t, unsigned long, + void (*iommu_io_pdir_entry)(__le64 *, space_t, unsigned long, unsigned long)) { struct scatterlist *dma_sg = startsg; /* pointer to current DMA */ unsigned int n_mappings = 0; unsigned long dma_offset = 0, dma_len = 0; - u64 *pdirp = NULL; + __le64 *pdirp = NULL; /* Horrible hack. For efficiency's sake, dma_sg starts one * entry below the true start (it is immediately incremented diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index ee5a2f4b74745..05e7103d1d407 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -202,7 +202,7 @@ static void sba_dump_pdir_entry(struct ioc *ioc, char *msg, uint pide) { /* start printing from lowest pde in rval */ - u64 *ptr = &(ioc->pdir_base[pide & (~0U * BITS_PER_LONG)]); + __le64 *ptr = &(ioc->pdir_base[pide & (~0U * BITS_PER_LONG)]); unsigned long *rptr = (unsigned long *) &(ioc->res_map[(pide >>3) & ~(sizeof(unsigned long) - 1)]); uint rcnt; @@ -569,7 +569,7 @@ typedef unsigned long space_t; */ static void -sba_io_pdir_entry(u64 *pdir_ptr, space_t sid, unsigned long vba, +sba_io_pdir_entry(__le64 *pdir_ptr, space_t sid, unsigned long vba, unsigned long hint) { u64 pa; /* physical address */ @@ -613,7 +613,7 @@ static void sba_mark_invalid(struct ioc *ioc, dma_addr_t iova, size_t byte_cnt) { u32 iovp = (u32) SBA_IOVP(ioc,iova); - u64 *pdir_ptr = &ioc->pdir_base[PDIR_INDEX(iovp)]; + __le64 *pdir_ptr = &ioc->pdir_base[PDIR_INDEX(iovp)]; #ifdef ASSERT_PDIR_SANITY /* Assert first pdir entry is set. @@ -714,7 +714,7 @@ sba_map_single(struct device *dev, void *addr, size_t size, unsigned long flags; dma_addr_t iovp; dma_addr_t offset; - u64 *pdir_start; + __le64 *pdir_start; int pide; ioc = GET_IOC(dev); @@ -1432,7 +1432,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) ioc->pdir_size = pdir_size = (iova_space_size/IOVP_SIZE) * sizeof(u64); - DBG_INIT("%s() hpa 0x%lx mem %ldMB IOV %dMB (%d bits)\n", + DBG_INIT("%s() hpa %px mem %ldMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_hpa, (unsigned long) totalram_pages() >> (20 - PAGE_SHIFT), @@ -1469,7 +1469,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) ioc->iovp_mask = ~(iova_space_mask + PAGE_SIZE - 1); #endif - DBG_INIT("%s() IOV base 0x%lx mask 0x%0lx\n", + DBG_INIT("%s() IOV base %#lx mask %#0lx\n", __func__, ioc->ibase, ioc->imask); /* @@ -1581,7 +1581,7 @@ printk("sba_hw_init(): mem_boot 0x%x 0x%x 0x%x 0x%x\n", PAGE0->mem_boot.hpa, if (!IS_PLUTO(sba_dev->dev)) { ioc_ctl = READ_REG(sba_dev->sba_hpa+IOC_CTRL); - DBG_INIT("%s() hpa 0x%lx ioc_ctl 0x%Lx ->", + DBG_INIT("%s() hpa %px ioc_ctl 0x%Lx ->", __func__, sba_dev->sba_hpa, ioc_ctl); ioc_ctl &= ~(IOC_CTRL_RM | IOC_CTRL_NC | IOC_CTRL_CE); ioc_ctl |= IOC_CTRL_DD | IOC_CTRL_D4 | IOC_CTRL_TC; @@ -1666,14 +1666,14 @@ printk("sba_hw_init(): mem_boot 0x%x 0x%x 0x%x 0x%x\n", PAGE0->mem_boot.hpa, /* flush out the last writes */ READ_REG(sba_dev->ioc[i].ioc_hpa + ROPE7_CTL); - DBG_INIT(" ioc[%d] ROPE_CFG 0x%Lx ROPE_DBG 0x%Lx\n", + DBG_INIT(" ioc[%d] ROPE_CFG %#lx ROPE_DBG %lx\n", i, - READ_REG(sba_dev->ioc[i].ioc_hpa + 0x40), - READ_REG(sba_dev->ioc[i].ioc_hpa + 0x50) + (unsigned long) READ_REG(sba_dev->ioc[i].ioc_hpa + 0x40), + (unsigned long) READ_REG(sba_dev->ioc[i].ioc_hpa + 0x50) ); - DBG_INIT(" STATUS_CONTROL 0x%Lx FLUSH_CTRL 0x%Lx\n", - READ_REG(sba_dev->ioc[i].ioc_hpa + 0x108), - READ_REG(sba_dev->ioc[i].ioc_hpa + 0x400) + DBG_INIT(" STATUS_CONTROL %#lx FLUSH_CTRL %#lx\n", + (unsigned long) READ_REG(sba_dev->ioc[i].ioc_hpa + 0x108), + (unsigned long) READ_REG(sba_dev->ioc[i].ioc_hpa + 0x400) ); if (IS_PLUTO(sba_dev->dev)) { @@ -1737,7 +1737,7 @@ sba_common_init(struct sba_device *sba_dev) #ifdef ASSERT_PDIR_SANITY /* Mark first bit busy - ie no IOVA 0 */ sba_dev->ioc[i].res_map[0] = 0x80; - sba_dev->ioc[i].pdir_base[0] = 0xeeffc0addbba0080ULL; + sba_dev->ioc[i].pdir_base[0] = (__force __le64) 0xeeffc0addbba0080ULL; #endif /* Third (and last) part of PIRANHA BUG */ From 9a47a710cf517801a8b4fff9949c4cecb5fd019a Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 30 Aug 2023 11:36:52 +0200 Subject: [PATCH 008/161] parisc: ccio-dma: Fix sparse warnings Signed-off-by: Helge Deller --- drivers/parisc/ccio-dma.c | 18 +++++++++--------- drivers/parisc/iommu-helpers.h | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 509a4072d50af..9ce0d20a6c581 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -214,7 +214,7 @@ struct ioa_registers { struct ioc { struct ioa_registers __iomem *ioc_regs; /* I/O MMU base address */ u8 *res_map; /* resource map, bit == pdir entry */ - u64 *pdir_base; /* physical base address */ + __le64 *pdir_base; /* physical base address */ u32 pdir_size; /* bytes, function of IOV Space size */ u32 res_hint; /* next available IOVP - circular search */ @@ -339,7 +339,7 @@ ccio_alloc_range(struct ioc *ioc, struct device *dev, size_t size) BUG_ON(pages_needed == 0); BUG_ON((pages_needed * IOVP_SIZE) > DMA_CHUNK_SIZE); - DBG_RES("%s() size: %d pages_needed %d\n", + DBG_RES("%s() size: %zu pages_needed %d\n", __func__, size, pages_needed); /* @@ -427,7 +427,7 @@ ccio_free_range(struct ioc *ioc, dma_addr_t iova, unsigned long pages_mapped) BUG_ON((pages_mapped * IOVP_SIZE) > DMA_CHUNK_SIZE); BUG_ON(pages_mapped > BITS_PER_LONG); - DBG_RES("%s(): res_idx: %d pages_mapped %d\n", + DBG_RES("%s(): res_idx: %d pages_mapped %lu\n", __func__, res_idx, pages_mapped); #ifdef CCIO_COLLECT_STATS @@ -543,7 +543,7 @@ static u32 hint_lookup[] = { * index are bits 12:19 of the value returned by LCI. */ static void -ccio_io_pdir_entry(u64 *pdir_ptr, space_t sid, unsigned long vba, +ccio_io_pdir_entry(__le64 *pdir_ptr, space_t sid, unsigned long vba, unsigned long hints) { register unsigned long pa; @@ -719,7 +719,7 @@ ccio_map_single(struct device *dev, void *addr, size_t size, unsigned long flags; dma_addr_t iovp; dma_addr_t offset; - u64 *pdir_start; + __le64 *pdir_start; unsigned long hint = hint_lookup[(int)direction]; BUG_ON(!dev); @@ -746,8 +746,8 @@ ccio_map_single(struct device *dev, void *addr, size_t size, pdir_start = &(ioc->pdir_base[idx]); - DBG_RUN("%s() 0x%p -> 0x%lx size: %0x%x\n", - __func__, addr, (long)iovp | offset, size); + DBG_RUN("%s() %px -> %#lx size: %zu\n", + __func__, addr, (long)(iovp | offset), size); /* If not cacheline aligned, force SAFE_DMA on the whole mess */ if((size % L1_CACHE_BYTES) || ((unsigned long)addr % L1_CACHE_BYTES)) @@ -805,7 +805,7 @@ ccio_unmap_page(struct device *dev, dma_addr_t iova, size_t size, return; } - DBG_RUN("%s() iovp 0x%lx/%x\n", + DBG_RUN("%s() iovp %#lx/%zx\n", __func__, (long)iova, size); iova ^= offset; /* clear offset bits */ @@ -1283,7 +1283,7 @@ ccio_ioc_init(struct ioc *ioc) iova_space_size>>20, iov_order + PAGE_SHIFT); - ioc->pdir_base = (u64 *)__get_free_pages(GFP_KERNEL, + ioc->pdir_base = (__le64 *)__get_free_pages(GFP_KERNEL, get_order(ioc->pdir_size)); if(NULL == ioc->pdir_base) { panic("%s() could not allocate I/O Page Table\n", __func__); diff --git a/drivers/parisc/iommu-helpers.h b/drivers/parisc/iommu-helpers.h index a00c38b6224ab..c43f1a212a5c8 100644 --- a/drivers/parisc/iommu-helpers.h +++ b/drivers/parisc/iommu-helpers.h @@ -31,8 +31,8 @@ iommu_fill_pdir(struct ioc *ioc, struct scatterlist *startsg, int nents, unsigned long vaddr; long size; - DBG_RUN_SG(" %d : %08lx/%05x %p/%05x\n", nents, - (unsigned long)sg_dma_address(startsg), cnt, + DBG_RUN_SG(" %d : %08lx %p/%05x\n", nents, + (unsigned long)sg_dma_address(startsg), sg_virt(startsg), startsg->length ); From 927c6c8aa27c284a799b8c18784e37d3373af908 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 30 Aug 2023 11:59:55 +0200 Subject: [PATCH 009/161] parisc: iosapic.c: Fix sparse warnings Signed-off-by: Helge Deller --- drivers/parisc/iosapic.c | 4 ++-- drivers/parisc/iosapic_private.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index a7df764f1a722..a4011461189b6 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -202,9 +202,9 @@ static inline void iosapic_write(void __iomem *iosapic, unsigned int reg, u32 va static DEFINE_SPINLOCK(iosapic_lock); -static inline void iosapic_eoi(void __iomem *addr, unsigned int data) +static inline void iosapic_eoi(__le32 __iomem *addr, __le32 data) { - __raw_writel(data, addr); + __raw_writel((__force u32)data, addr); } /* diff --git a/drivers/parisc/iosapic_private.h b/drivers/parisc/iosapic_private.h index 73ecc657ad954..bd8ff40162b4b 100644 --- a/drivers/parisc/iosapic_private.h +++ b/drivers/parisc/iosapic_private.h @@ -118,8 +118,8 @@ struct iosapic_irt { struct vector_info { struct iosapic_info *iosapic; /* I/O SAPIC this vector is on */ struct irt_entry *irte; /* IRT entry */ - u32 __iomem *eoi_addr; /* precalculate EOI reg address */ - u32 eoi_data; /* IA64: ? PA: swapped txn_data */ + __le32 __iomem *eoi_addr; /* precalculate EOI reg address */ + __le32 eoi_data; /* IA64: ? PA: swapped txn_data */ int txn_irq; /* virtual IRQ number for processor */ ulong txn_addr; /* IA64: id_eid PA: partial HPA */ u32 txn_data; /* CPU interrupt bit */ From b137b9d60b8add5620a06c687a71ce18776730b0 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 31 Aug 2023 22:08:32 +0200 Subject: [PATCH 010/161] parisc: drivers: Fix sparse warning Fix "warning: directive in macro's argument list" warning. Signed-off-by: Helge Deller --- arch/parisc/kernel/drivers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index 8f4b77648491a..ed8b759480614 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -925,9 +925,9 @@ static __init void qemu_header(void) pr_info("#define PARISC_MODEL \"%s\"\n\n", boot_cpu_data.pdc.sys_model_name); + #define p ((unsigned long *)&boot_cpu_data.pdc.model) pr_info("#define PARISC_PDC_MODEL 0x%lx, 0x%lx, 0x%lx, " "0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx\n\n", - #define p ((unsigned long *)&boot_cpu_data.pdc.model) p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8]); #undef p From b1bef1388c427cdad7331a9c8eb4ebbbe5b954b0 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 31 Aug 2023 22:36:12 +0200 Subject: [PATCH 011/161] parisc: irq: Make irq_stack_union static to avoid sparse warning Signed-off-by: Helge Deller --- arch/parisc/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 12c4d4104ade4..2f81bfd4f15e1 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -365,7 +365,7 @@ union irq_stack_union { volatile unsigned int lock[1]; }; -DEFINE_PER_CPU(union irq_stack_union, irq_stack_union) = { +static DEFINE_PER_CPU(union irq_stack_union, irq_stack_union) = { .slock = { 1,1,1,1 }, }; #endif From e2884fe84a83c562346eb9d92783a3576ce67177 Mon Sep 17 00:00:00 2001 From: Simon Pilkington Date: Fri, 1 Sep 2023 08:17:38 +0100 Subject: [PATCH 012/161] drm/amd: Make fence wait in suballocator uninterruptible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c103a23f2f29 ("drm/amd: Convert amdgpu to use suballocation helper.") made the fence wait in amdgpu_sa_bo_new() interruptible but there is no code to handle an interrupt. This caused the kernel to randomly explode in high-VRAM-pressure situations so make it uninterruptible again. Signed-off-by: Simon Pilkington Fixes: c103a23f2f29 ("drm/amd: Convert amdgpu to use suballocation helper.") Reviewed-by: Christian König Signed-off-by: Christian König Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2761 CC: stable@vger.kernel.org # 6.4+ --- drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c index c6b4337eb20c3..10df731998b22 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c @@ -81,7 +81,7 @@ int amdgpu_sa_bo_new(struct amdgpu_sa_manager *sa_manager, unsigned int size) { struct drm_suballoc *sa = drm_suballoc_new(&sa_manager->base, size, - GFP_KERNEL, true, 0); + GFP_KERNEL, false, 0); if (IS_ERR(sa)) { *sa_bo = NULL; From dce19a3fede254fd11e0013f1c6fe3bfc37a4d73 Mon Sep 17 00:00:00 2001 From: David Gow Date: Wed, 30 Aug 2023 08:21:15 +0800 Subject: [PATCH 013/161] kunit: test: Make filter strings in executor_test writable KUnit's attribute filtering feature needs the filter strings passed in to be writable, as it modifies them in-place during parsing. This works for the filters passed on the kernel command line, but the string literals used in the executor tests are at least theoretically read-only (though they work on x86_64 for some reason). s390 wasn't fooled, and crashed when these tests were run. Use a 'char[]' instead, (and make an explicit variable for the current filter in parse_filter_attr_test), which will store the string in a writable segment. Fixes: 76066f93f1df ("kunit: add tests for filtering attributes") Closes: https://lore.kernel.org/linux-kselftest/55950256-c00a-4d21-a2c0-cf9f0e5b8a9a@roeck-us.net/ Signed-off-by: David Gow Tested-by: Guenter Roeck Reviewed-by: Rae Moar Signed-off-by: Shuah Khan --- lib/kunit/executor_test.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/kunit/executor_test.c b/lib/kunit/executor_test.c index 4084071d0eb5b..b4f6f96b28445 100644 --- a/lib/kunit/executor_test.c +++ b/lib/kunit/executor_test.c @@ -119,7 +119,7 @@ static void parse_filter_attr_test(struct kunit *test) { int j, filter_count; struct kunit_attr_filter *parsed_filters; - char *filters = "speed>slow, module!=example"; + char filters[] = "speed>slow, module!=example", *filter = filters; int err = 0; filter_count = kunit_get_filter_count(filters); @@ -128,7 +128,7 @@ static void parse_filter_attr_test(struct kunit *test) parsed_filters = kunit_kcalloc(test, filter_count, sizeof(*parsed_filters), GFP_KERNEL); for (j = 0; j < filter_count; j++) { - parsed_filters[j] = kunit_next_attr_filter(&filters, &err); + parsed_filters[j] = kunit_next_attr_filter(&filter, &err); KUNIT_ASSERT_EQ_MSG(test, err, 0, "failed to parse filter '%s'", filters[j]); } @@ -154,6 +154,7 @@ static void filter_attr_test(struct kunit *test) .start = subsuite, .end = &subsuite[2], }; struct kunit_suite_set got; + char filter[] = "speed>slow"; int err = 0; subsuite[0] = alloc_fake_suite(test, "normal_suite", dummy_attr_test_cases); @@ -168,7 +169,7 @@ static void filter_attr_test(struct kunit *test) * attribute is unset and thus, the filtering is based on the parent attribute * of slow. */ - got = kunit_filter_suites(&suite_set, NULL, "speed>slow", NULL, &err); + got = kunit_filter_suites(&suite_set, NULL, filter, NULL, &err); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start); KUNIT_ASSERT_EQ(test, err, 0); kfree_at_end(test, got.start); @@ -191,12 +192,13 @@ static void filter_attr_empty_test(struct kunit *test) .start = subsuite, .end = &subsuite[2], }; struct kunit_suite_set got; + char filter[] = "module!=dummy"; int err = 0; subsuite[0] = alloc_fake_suite(test, "suite1", dummy_attr_test_cases); subsuite[1] = alloc_fake_suite(test, "suite2", dummy_attr_test_cases); - got = kunit_filter_suites(&suite_set, NULL, "module!=dummy", NULL, &err); + got = kunit_filter_suites(&suite_set, NULL, filter, NULL, &err); KUNIT_ASSERT_EQ(test, err, 0); kfree_at_end(test, got.start); /* just in case */ @@ -211,12 +213,13 @@ static void filter_attr_skip_test(struct kunit *test) .start = subsuite, .end = &subsuite[1], }; struct kunit_suite_set got; + char filter[] = "speed>slow"; int err = 0; subsuite[0] = alloc_fake_suite(test, "suite", dummy_attr_test_cases); /* Want: suite(slow, normal), NULL -> suite(slow with SKIP, normal), NULL */ - got = kunit_filter_suites(&suite_set, NULL, "speed>slow", "skip", &err); + got = kunit_filter_suites(&suite_set, NULL, filter, "skip", &err); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start); KUNIT_ASSERT_EQ(test, err, 0); kfree_at_end(test, got.start); From 45dc8fc07d01b6786db88b5b176c67f9e3487d1e Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Sat, 2 Sep 2023 10:51:02 +0100 Subject: [PATCH 014/161] fbdev/g364fb: fix build failure with mips Fix the typo which resulted in the driver using FB_DEFAULT_IOMEM_HELPERS instead of FB_DEFAULT_IOMEM_OPS as the fbdev I/O helpers. Fixes: 501126083855 ("fbdev/g364fb: Use fbdev I/O helpers") Suggested-by: Linus Torvalds Signed-off-by: Sudip Mukherjee Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20230902095102.5908-1-sudip.mukherjee@codethink.co.uk --- drivers/video/fbdev/g364fb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/g364fb.c b/drivers/video/fbdev/g364fb.c index 7a1013b22fa71..ee6fe51e0a6b1 100644 --- a/drivers/video/fbdev/g364fb.c +++ b/drivers/video/fbdev/g364fb.c @@ -112,7 +112,7 @@ static int g364fb_blank(int blank, struct fb_info *info); static const struct fb_ops g364fb_ops = { .owner = THIS_MODULE, - FB_DEFAULT_IOMEM_HELPERS, + FB_DEFAULT_IOMEM_OPS, .fb_setcolreg = g364fb_setcolreg, .fb_pan_display = g364fb_pan_display, .fb_blank = g364fb_blank, From 7583028d359db3cd0072badcc576b4f9455fd27a Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 4 Sep 2023 10:14:20 +0800 Subject: [PATCH 015/161] drm: gm12u320: Fix the timeout usage for usb_bulk_msg() The timeout arg of usb_bulk_msg() is ms already, which has been converted to jiffies by msecs_to_jiffies() in usb_start_wait_urb(). So fix the usage by removing the redundant msecs_to_jiffies() in the macros. And as Hans suggested, also remove msecs_to_jiffies() for the IDLE_TIMEOUT macro to make it consistent here and so change IDLE_TIMEOUT to msecs_to_jiffies(IDLE_TIMEOUT) where it is used. Fixes: e4f86e437164 ("drm: Add Grain Media GM12U320 driver v2") Signed-off-by: Jinjie Ruan Suggested-by: Hans de Goede Reviewed-by: Hans de Goede Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20230904021421.1663892-1-ruanjinjie@huawei.com --- drivers/gpu/drm/tiny/gm12u320.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/tiny/gm12u320.c b/drivers/gpu/drm/tiny/gm12u320.c index c5bb683e440c5..0187539ff5eaa 100644 --- a/drivers/gpu/drm/tiny/gm12u320.c +++ b/drivers/gpu/drm/tiny/gm12u320.c @@ -70,10 +70,10 @@ MODULE_PARM_DESC(eco_mode, "Turn on Eco mode (less bright, more silent)"); #define READ_STATUS_SIZE 13 #define MISC_VALUE_SIZE 4 -#define CMD_TIMEOUT msecs_to_jiffies(200) -#define DATA_TIMEOUT msecs_to_jiffies(1000) -#define IDLE_TIMEOUT msecs_to_jiffies(2000) -#define FIRST_FRAME_TIMEOUT msecs_to_jiffies(2000) +#define CMD_TIMEOUT 200 +#define DATA_TIMEOUT 1000 +#define IDLE_TIMEOUT 2000 +#define FIRST_FRAME_TIMEOUT 2000 #define MISC_REQ_GET_SET_ECO_A 0xff #define MISC_REQ_GET_SET_ECO_B 0x35 @@ -389,7 +389,7 @@ static void gm12u320_fb_update_work(struct work_struct *work) * switches back to showing its logo. */ queue_delayed_work(system_long_wq, &gm12u320->fb_update.work, - IDLE_TIMEOUT); + msecs_to_jiffies(IDLE_TIMEOUT)); return; err: From ab048302026d7701e7fbd718917e0dbcff0c4223 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 4 Sep 2023 14:17:56 +0300 Subject: [PATCH 016/161] ovl: fix failed copyup of fileattr on a symlink Some local filesystems support setting persistent fileattr flags (e.g. FS_NOATIME_FL) on directories and regular files via ioctl. Some of those persistent fileattr flags are reflected to vfs as in-memory inode flags (e.g. S_NOATIME). Overlayfs uses the in-memory inode flags (e.g. S_NOATIME) on a lower file as an indication that a the lower file may have persistent inode fileattr flags (e.g. FS_NOATIME_FL) that need to be copied to upper file. However, in some cases, the S_NOATIME in-memory flag could be a false indication for persistent FS_NOATIME_FL fileattr. For example, with NFS and FUSE lower fs, as was the case in the two bug reports, the S_NOATIME flag is set unconditionally for all inodes. Users cannot set persistent fileattr flags on symlinks and special files, but in some local fs, such as ext4/btrfs/tmpfs, the FS_NOATIME_FL fileattr flag are inheritted to symlinks and special files from parent directory. In both cases described above, when lower symlink has the S_NOATIME flag, overlayfs will try to copy the symlink's fileattrs and fail with error ENOXIO, because it could not open the symlink for the ioctl security hook. To solve this failure, do not attempt to copyup fileattrs for anything other than directories and regular files. Reported-by: Ruiwen Zhao Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217850 Fixes: 72db82115d2b ("ovl: copy up sync/noatime fileattr flags") Cc: # v5.15 Reviewed-by: Miklos Szeredi Signed-off-by: Amir Goldstein --- fs/overlayfs/copy_up.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index bae404a1bad48..d1761ec5866aa 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -618,7 +618,8 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) if (err) return err; - if (inode->i_flags & OVL_COPY_I_FLAGS_MASK) { + if (inode->i_flags & OVL_COPY_I_FLAGS_MASK && + (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) { /* * Copy the fileattr inode flags that are the source of already * copied i_flags From 724768a39374d35b70eaeae8dd87048a2ec7ae8e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 22 Aug 2023 20:50:59 +0300 Subject: [PATCH 017/161] ovl: fix incorrect fdput() on aio completion ovl_{read,write}_iter() always call fdput(real) to put one or zero refcounts of the real file, but for aio, whether it was submitted or not, ovl_aio_put() also calls fdput(), which is not balanced. This is only a problem in the less common case when FDPUT_FPUT flag is set. To fix the problem use get_file() to take file refcount and use fput() instead of fdput() in ovl_aio_put(). Fixes: 2406a307ac7d ("ovl: implement async IO routines") Cc: # v5.6 Reviewed-by: Miklos Szeredi Signed-off-by: Amir Goldstein --- fs/overlayfs/file.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 3b4cc633d7637..4193633c4c7a7 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -19,7 +19,6 @@ struct ovl_aio_req { struct kiocb iocb; refcount_t ref; struct kiocb *orig_iocb; - struct fd fd; }; static struct kmem_cache *ovl_aio_request_cachep; @@ -280,7 +279,7 @@ static rwf_t ovl_iocb_to_rwf(int ifl) static inline void ovl_aio_put(struct ovl_aio_req *aio_req) { if (refcount_dec_and_test(&aio_req->ref)) { - fdput(aio_req->fd); + fput(aio_req->iocb.ki_filp); kmem_cache_free(ovl_aio_request_cachep, aio_req); } } @@ -342,10 +341,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - aio_req->fd = real; real.flags = 0; aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, real.file); + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_complete = ovl_aio_rw_complete; refcount_set(&aio_req->ref, 2); ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); @@ -409,10 +407,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - aio_req->fd = real; real.flags = 0; aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, real.file); + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_flags = ifl; aio_req->iocb.ki_complete = ovl_aio_rw_complete; refcount_set(&aio_req->ref, 2); From 2810c1e99867a811e631dd24e63e6c1e3b78a59d Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Sun, 3 Sep 2023 15:10:25 +0800 Subject: [PATCH 018/161] kunit: Fix wild-memory-access bug in kunit_free_suite_set() Inject fault while probing kunit-example-test.ko, if kstrdup() fails in mod_sysfs_setup() in load_module(), the mod->state will switch from MODULE_STATE_COMING to MODULE_STATE_GOING instead of from MODULE_STATE_LIVE to MODULE_STATE_GOING, so only kunit_module_exit() will be called without kunit_module_init(), and the mod->kunit_suites is no set correctly and the free in kunit_free_suite_set() will cause below wild-memory-access bug. The mod->state state machine when load_module() succeeds: MODULE_STATE_UNFORMED ---> MODULE_STATE_COMING ---> MODULE_STATE_LIVE ^ | | | delete_module +---------------- MODULE_STATE_GOING <---------+ The mod->state state machine when load_module() fails at mod_sysfs_setup(): MODULE_STATE_UNFORMED ---> MODULE_STATE_COMING ---> MODULE_STATE_GOING ^ | | | +-----------------------------------------------+ Call kunit_module_init() at MODULE_STATE_COMING state to fix the issue because MODULE_STATE_LIVE is transformed from it. Unable to handle kernel paging request at virtual address ffffff341e942a88 KASAN: maybe wild-memory-access in range [0x0003f9a0f4a15440-0x0003f9a0f4a15447] Mem abort info: ESR = 0x0000000096000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault Data abort info: ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000441ea000 [ffffff341e942a88] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP Modules linked in: kunit_example_test(-) cfg80211 rfkill 8021q garp mrp stp llc ipv6 [last unloaded: kunit_example_test] CPU: 3 PID: 2035 Comm: modprobe Tainted: G W N 6.5.0-next-20230828+ #136 Hardware name: linux,dummy-virt (DT) pstate: a0000005 (NzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : kfree+0x2c/0x70 lr : kunit_free_suite_set+0xcc/0x13c sp : ffff8000829b75b0 x29: ffff8000829b75b0 x28: ffff8000829b7b90 x27: 0000000000000000 x26: dfff800000000000 x25: ffffcd07c82a7280 x24: ffffcd07a50ab300 x23: ffffcd07a50ab2e8 x22: 1ffff00010536ec0 x21: dfff800000000000 x20: ffffcd07a50ab2f0 x19: ffffcd07a50ab2f0 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: ffffcd07c24b6764 x14: ffffcd07c24b63c0 x13: ffffcd07c4cebb94 x12: ffff700010536ec7 x11: 1ffff00010536ec6 x10: ffff700010536ec6 x9 : dfff800000000000 x8 : 00008fffefac913a x7 : 0000000041b58ab3 x6 : 0000000000000000 x5 : 1ffff00010536ec5 x4 : ffff8000829b7628 x3 : dfff800000000000 x2 : ffffff341e942a80 x1 : ffffcd07a50aa000 x0 : fffffc0000000000 Call trace: kfree+0x2c/0x70 kunit_free_suite_set+0xcc/0x13c kunit_module_notify+0xd8/0x360 blocking_notifier_call_chain+0xc4/0x128 load_module+0x382c/0x44a4 init_module_from_file+0xd4/0x128 idempotent_init_module+0x2c8/0x524 __arm64_sys_finit_module+0xac/0x100 invoke_syscall+0x6c/0x258 el0_svc_common.constprop.0+0x160/0x22c do_el0_svc+0x44/0x5c el0_svc+0x38/0x78 el0t_64_sync_handler+0x13c/0x158 el0t_64_sync+0x190/0x194 Code: aa0003e1 b25657e0 d34cfc42 8b021802 (f9400440) ---[ end trace 0000000000000000 ]--- Kernel panic - not syncing: Oops: Fatal exception SMP: stopping secondary CPUs Kernel Offset: 0x4d0742200000 from 0xffff800080000000 PHYS_OFFSET: 0xffffee43c0000000 CPU features: 0x88000203,3c020000,1000421b Memory Limit: none Rebooting in 1 seconds.. Fixes: 3d6e44623841 ("kunit: unify module and builtin suite definitions") Signed-off-by: Jinjie Ruan Reviewed-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 49698a168437a..421f139814123 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -784,12 +784,13 @@ static int kunit_module_notify(struct notifier_block *nb, unsigned long val, switch (val) { case MODULE_STATE_LIVE: - kunit_module_init(mod); break; case MODULE_STATE_GOING: kunit_module_exit(mod); break; case MODULE_STATE_COMING: + kunit_module_init(mod); + break; case MODULE_STATE_UNFORMED: break; } From 4b00920da1dd2bbb33baeb2e7b9808af4c68de97 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Sun, 3 Sep 2023 15:10:26 +0800 Subject: [PATCH 019/161] kunit: Fix the wrong err path and add goto labels in kunit_filter_suites() Take the last kfree(parsed_filters) and add it to be the first. Take the first kfree(copy) and add it to be the last. The Best practice is to return these errors reversely. And as David suggested, add several labels which target only the things which actually have been allocated so far. Fixes: 529534e8cba3 ("kunit: Add ability to filter attributes") Fixes: abbf73816b6f ("kunit: fix possible memory leak in kunit_filter_suites()") Signed-off-by: Jinjie Ruan Reviewed-by: Rae Moar Suggested-by: David Gow Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/executor.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index 5181aa2e760b6..0eda42b0c9bba 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -166,7 +166,7 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, for (j = 0; j < filter_count; j++) parsed_filters[j] = kunit_next_attr_filter(&filters, err); if (*err) - goto err; + goto free_parsed_filters; } for (i = 0; &suite_set->start[i] != suite_set->end; i++) { @@ -178,7 +178,7 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, parsed_glob.test_glob); if (IS_ERR(filtered_suite)) { *err = PTR_ERR(filtered_suite); - goto err; + goto free_parsed_filters; } } if (filter_count > 0 && parsed_filters != NULL) { @@ -195,10 +195,11 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, filtered_suite = new_filtered_suite; if (*err) - goto err; + goto free_parsed_filters; + if (IS_ERR(filtered_suite)) { *err = PTR_ERR(filtered_suite); - goto err; + goto free_parsed_filters; } if (!filtered_suite) break; @@ -213,17 +214,19 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, filtered.start = copy_start; filtered.end = copy; -err: - if (*err) - kfree(copy); +free_parsed_filters: + if (filter_count) + kfree(parsed_filters); +free_parsed_glob: if (filter_glob) { kfree(parsed_glob.suite_glob); kfree(parsed_glob.test_glob); } - if (filter_count) - kfree(parsed_filters); +free_copy: + if (*err) + kfree(copy); return filtered; } From 2b56a4b79b7b3086e842d39611db4e19b19dbe2a Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Sun, 3 Sep 2023 15:10:27 +0800 Subject: [PATCH 020/161] kunit: Fix possible null-ptr-deref in kunit_parse_glob_filter() Inject fault while probing kunit-example-test.ko, if kzalloc fails in kunit_parse_glob_filter(), strcpy() or strncpy() to NULL will cause below null-ptr-deref bug. So check NULL for kzalloc() and return int instead of void for kunit_parse_glob_filter(). Unable to handle kernel paging request at virtual address dfff800000000000 KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] Mem abort info: ESR = 0x0000000096000005 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x05: level 1 translation fault Data abort info: ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [dfff800000000000] address between user and kernel address ranges Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP Modules linked in: kunit_example_test cfg80211 rfkill 8021q garp mrp stp llc ipv6 [last unloaded: kunit_example_test] CPU: 4 PID: 6047 Comm: modprobe Tainted: G W N 6.5.0-next-20230829+ #141 Hardware name: linux,dummy-virt (DT) pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : strncpy+0x58/0xc0 lr : kunit_filter_suites+0x15c/0xa84 sp : ffff800082a17420 x29: ffff800082a17420 x28: 0000000000000000 x27: 0000000000000004 x26: 0000000000000000 x25: ffffa847e40a5320 x24: 0000000000000001 x23: 0000000000000000 x22: 0000000000000001 x21: dfff800000000000 x20: 000000000000002a x19: 0000000000000000 x18: 00000000750b3b54 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 34393178302f3039 x12: ffff7508fcea4ec1 x11: 1ffff508fcea4ec0 x10: ffff7508fcea4ec0 x9 : dfff800000000000 x8 : ffff6051b1a7f86a x7 : ffff800082a17270 x6 : 0000000000000002 x5 : 0000000000000098 x4 : ffff028d9817b250 x3 : 0000000000000000 x2 : 0000000000000000 x1 : ffffa847e40a5320 x0 : 0000000000000000 Call trace: strncpy+0x58/0xc0 kunit_filter_suites+0x15c/0xa84 kunit_module_notify+0x1b0/0x3ac blocking_notifier_call_chain+0xc4/0x128 do_init_module+0x250/0x594 load_module+0x37b0/0x44b4 init_module_from_file+0xd4/0x128 idempotent_init_module+0x2c8/0x524 __arm64_sys_finit_module+0xac/0x100 invoke_syscall+0x6c/0x258 el0_svc_common.constprop.0+0x160/0x22c do_el0_svc+0x44/0x5c el0_svc+0x38/0x78 el0t_64_sync_handler+0x13c/0x158 el0t_64_sync+0x190/0x194 Code: 5400028a d343fe63 12000a62 39400034 (38f56863) ---[ end trace 0000000000000000 ]--- Kernel panic - not syncing: Oops: Fatal exception SMP: stopping secondary CPUs Kernel Offset: 0x284761400000 from 0xffff800080000000 PHYS_OFFSET: 0xfffffd7380000000 CPU features: 0x88000203,3c020000,1000421b Memory Limit: none Rebooting in 1 seconds.. Fixes: a127b154a8f2 ("kunit: tool: allow filtering test cases via glob") Signed-off-by: Jinjie Ruan Reviewed-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/executor.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index 0eda42b0c9bba..28f144de748b5 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -65,7 +65,7 @@ struct kunit_glob_filter { }; /* Split "suite_glob.test_glob" into two. Assumes filter_glob is not empty. */ -static void kunit_parse_glob_filter(struct kunit_glob_filter *parsed, +static int kunit_parse_glob_filter(struct kunit_glob_filter *parsed, const char *filter_glob) { const int len = strlen(filter_glob); @@ -73,16 +73,28 @@ static void kunit_parse_glob_filter(struct kunit_glob_filter *parsed, if (!period) { parsed->suite_glob = kzalloc(len + 1, GFP_KERNEL); + if (!parsed->suite_glob) + return -ENOMEM; + parsed->test_glob = NULL; strcpy(parsed->suite_glob, filter_glob); - return; + return 0; } parsed->suite_glob = kzalloc(period - filter_glob + 1, GFP_KERNEL); + if (!parsed->suite_glob) + return -ENOMEM; + parsed->test_glob = kzalloc(len - (period - filter_glob) + 1, GFP_KERNEL); + if (!parsed->test_glob) { + kfree(parsed->suite_glob); + return -ENOMEM; + } strncpy(parsed->suite_glob, filter_glob, period - filter_glob); strncpy(parsed->test_glob, period + 1, len - (period - filter_glob)); + + return 0; } /* Create a copy of suite with only tests that match test_glob. */ @@ -152,8 +164,11 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, } copy_start = copy; - if (filter_glob) - kunit_parse_glob_filter(&parsed_glob, filter_glob); + if (filter_glob) { + *err = kunit_parse_glob_filter(&parsed_glob, filter_glob); + if (*err) + goto free_copy; + } /* Parse attribute filters */ if (filters) { From 9076bc476d7ebf0565903c4b048442131825c1c3 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Sun, 3 Sep 2023 15:10:28 +0800 Subject: [PATCH 021/161] kunit: Fix possible memory leak in kunit_filter_suites() If both filter_glob and filters are not NULL, and kunit_parse_glob_filter() succeed, but kcalloc parsed_filters fails, the suite_glob and test_glob of parsed kzalloc in kunit_parse_glob_filter() will be leaked. As Rae suggested, assign -ENOMEM to *err to correctly free copy and goto free_parsed_glob to free the suite/test_glob of parsed. Fixes: 1c9fd080dffe ("kunit: fix uninitialized variables bug in attributes filtering") Signed-off-by: Jinjie Ruan Suggested-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/executor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index 28f144de748b5..a6348489d45fe 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -175,8 +175,8 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, filter_count = kunit_get_filter_count(filters); parsed_filters = kcalloc(filter_count, sizeof(*parsed_filters), GFP_KERNEL); if (!parsed_filters) { - kfree(copy); - return filtered; + *err = -ENOMEM; + goto free_parsed_glob; } for (j = 0; j < filter_count; j++) parsed_filters[j] = kunit_next_attr_filter(&filters, err); From f4e4ada586995b17f828c6d147d1800eb1471450 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Mon, 26 Jun 2023 08:11:44 +0800 Subject: [PATCH 022/161] selftests/ftrace: Correctly enable event in instance-event.tc Function instance_set() expects to enable event 'sched_switch', so we should set 1 to its 'enable' file. Testcase passed after this patch: # ./ftracetest test.d/instances/instance-event.tc === Ftrace unit tests === [1] Test creation and deletion of trace instances while setting an event [PASS] # of passed: 1 # of failed: 0 # of unresolved: 0 # of untested: 0 # of unsupported: 0 # of xfailed: 0 # of undefined(test bug): 0 Signed-off-by: Zheng Yejian Acked-by: Masami Hiramatsu (Google) Acked-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- .../testing/selftests/ftrace/test.d/instances/instance-event.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/instances/instance-event.tc b/tools/testing/selftests/ftrace/test.d/instances/instance-event.tc index 0eb47fbb3f44d..42422e4251078 100644 --- a/tools/testing/selftests/ftrace/test.d/instances/instance-event.tc +++ b/tools/testing/selftests/ftrace/test.d/instances/instance-event.tc @@ -39,7 +39,7 @@ instance_read() { instance_set() { while :; do - echo 1 > foo/events/sched/sched_switch + echo 1 > foo/events/sched/sched_switch/enable done 2> /dev/null } From 51aab5ffceb43e05119eb059048fd75765d2bc21 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 5 Sep 2023 14:26:08 -0400 Subject: [PATCH 023/161] tracefs: Add missing lockdown check to tracefs_create_dir() The function tracefs_create_dir() was missing a lockdown check and was called by the RV code. This gave an inconsistent behavior of this function returning success while other tracefs functions failed. This caused the inode being freed by the wrong kmem_cache. Link: https://lkml.kernel.org/r/20230905182711.692687042@goodmis.org Link: https://lore.kernel.org/all/202309050916.58201dc6-oliver.sang@intel.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ajay Kaher Cc: Ching-lin Yu Fixes: bf8e602186ec4 ("tracing: Do not create tracefs files if tracefs lockdown is in effect") Reported-by: kernel test robot Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index de5b72216b1a7..3b8dd938b1c8f 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -673,6 +673,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, */ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) { + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + return __create_dir(name, parent, &simple_dir_inode_operations); } From e24709454c458283146485b76abfcc6d8ea964c7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 5 Sep 2023 14:26:09 -0400 Subject: [PATCH 024/161] tracefs/eventfs: Add missing lockdown checks All the eventfs external functions do not check if TRACEFS_LOCKDOWN was set or not. This can caused some functions to return success while others fail, which can trigger unexpected errors. Add the missing lockdown checks. Link: https://lkml.kernel.org/r/20230905182711.899724045@goodmis.org Link: https://lore.kernel.org/all/202309050916.58201dc6-oliver.sang@intel.com/ Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ajay Kaher Cc: Ching-lin Yu Reported-by: kernel test robot Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 237c6f370ad9d..fa1a1679a8866 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -491,6 +491,9 @@ struct dentry *eventfs_create_events_dir(const char *name, struct tracefs_inode *ti; struct inode *inode; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + if (IS_ERR(dentry)) return dentry; @@ -538,6 +541,9 @@ struct eventfs_file *eventfs_add_subsystem_dir(const char *name, struct eventfs_inode *ei_parent; struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + if (!parent) return ERR_PTR(-EINVAL); @@ -569,6 +575,9 @@ struct eventfs_file *eventfs_add_dir(const char *name, { struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + if (!ef_parent) return ERR_PTR(-EINVAL); @@ -606,6 +615,9 @@ int eventfs_add_events_file(const char *name, umode_t mode, struct eventfs_inode *ei; struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return -ENODEV; + if (!parent) return -EINVAL; @@ -654,6 +666,9 @@ int eventfs_add_file(const char *name, umode_t mode, { struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return -ENODEV; + if (!ef_parent) return -EINVAL; From 3f091387a39795812aab4303949bbc9baa22c077 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 1 Sep 2023 08:04:31 +0200 Subject: [PATCH 025/161] parisc: shmparam.h: Document aliasing requirements of PA-RISC Add some documentation why PA-RISC uses SHMLBA and SHM_COLOUR. Signed-off-by: Helge Deller --- arch/parisc/include/asm/shmparam.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/parisc/include/asm/shmparam.h b/arch/parisc/include/asm/shmparam.h index 74f74e4d35b72..5a95b0f62b87e 100644 --- a/arch/parisc/include/asm/shmparam.h +++ b/arch/parisc/include/asm/shmparam.h @@ -2,6 +2,21 @@ #ifndef _ASMPARISC_SHMPARAM_H #define _ASMPARISC_SHMPARAM_H +/* + * PA-RISC uses virtually indexed & physically tagged (VIPT) caches + * which has strict requirements when two pages to the same physical + * address are accessed through different mappings. Read the section + * "Address Aliasing" in the arch docs for more detail: + * PA-RISC 1.1 (page 3-6): + * https://parisc.wiki.kernel.org/images-parisc/6/68/Pa11_acd.pdf + * PA-RISC 2.0 (page F-5): + * https://parisc.wiki.kernel.org/images-parisc/7/73/Parisc2.0.pdf + * + * For Linux we allow kernel and userspace to map pages on page size + * granularity (SHMLBA) but have to ensure that, if two pages are + * mapped to the same physical address, the virtual and physical + * addresses modulo SHM_COLOUR are identical. + */ #define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ #define SHM_COLOUR 0x00400000 /* shared mappings colouring */ From 70bd68d5b61a63f87b5e986d9100c8ce46c5df4d Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 1 Sep 2023 16:09:23 +0200 Subject: [PATCH 026/161] parisc: Prepare for Block-TLB support on 32-bit kernel Change HUGEPAGE_SIZE to become 4 MB on 32-bit kernels, which leads that kernel code and kernel data will start on 4 MB boundaries. Although a 32-bit kernel does not support huge pages, most machines have support for Block-TLBs (BTLB) which allow to configure the system to use large pages (block TLBs) to minimize the TLB contention. This is done through calls to PDC and the 32-bit kernel can then call BTLB PDC functions to tell the machine to optimize the TLBs. Signed-off-by: Helge Deller --- arch/parisc/kernel/asm-offsets.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c index 94652e13c2603..757816a7bd4b2 100644 --- a/arch/parisc/kernel/asm-offsets.c +++ b/arch/parisc/kernel/asm-offsets.c @@ -275,6 +275,8 @@ int main(void) * and kernel data on physical huge pages */ #ifdef CONFIG_HUGETLB_PAGE DEFINE(HUGEPAGE_SIZE, 1UL << REAL_HPAGE_SHIFT); +#elif !defined(CONFIG_64BIT) + DEFINE(HUGEPAGE_SIZE, 4*1024*1024); #else DEFINE(HUGEPAGE_SIZE, PAGE_SIZE); #endif From eda205211a522312b667d5bd25d58bee8504c09e Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 7 Sep 2023 08:03:35 +0200 Subject: [PATCH 027/161] parisc: BTLB: Clear possibly existing BTLB entries Call PDC to remove all existing BTLB entries (which may exist from some previous operating system runs) before switching to virtual mode. Signed-off-by: Helge Deller --- arch/parisc/kernel/head.S | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S index fd15fd4bbb61b..a171bf3c6b318 100644 --- a/arch/parisc/kernel/head.S +++ b/arch/parisc/kernel/head.S @@ -180,10 +180,10 @@ $pgt_fill_loop: std %dp,0x18(%r10) #endif -#ifdef CONFIG_64BIT - /* Get PDCE_PROC for monarch CPU. */ #define MEM_PDC_LO 0x388 #define MEM_PDC_HI 0x35C +#ifdef CONFIG_64BIT + /* Get PDCE_PROC for monarch CPU. */ ldw MEM_PDC_LO(%r0),%r3 ldw MEM_PDC_HI(%r0),%r10 depd %r10, 31, 32, %r3 /* move to upper word */ @@ -269,7 +269,17 @@ stext_pdc_ret: tovirt_r1 %r6 mtctl %r6,%cr30 /* restore task thread info */ #endif - + +#ifndef CONFIG_64BIT + /* clear all BTLBs */ + ldi PDC_BLOCK_TLB,%arg0 + load32 PA(stext_pdc_btlb_ret), %rp + ldw MEM_PDC_LO(%r0),%r3 + bv (%r3) + ldi PDC_BTLB_PURGE_ALL,%arg1 +stext_pdc_btlb_ret: +#endif + /* PARANOID: clear user scratch/user space SR's */ mtsp %r0,%sr0 mtsp %r0,%sr1 From 510610f96d65277940a02f47d7bc7a06c8a2ab7a Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 7 Sep 2023 08:06:37 +0200 Subject: [PATCH 028/161] parisc: BTLB: Add BTLB insert and purge firmware function wrappers Signed-off-by: Helge Deller --- arch/parisc/include/asm/pdc.h | 3 +++ arch/parisc/kernel/firmware.c | 42 +++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/arch/parisc/include/asm/pdc.h b/arch/parisc/include/asm/pdc.h index 269b9a159f01f..2fd7eb2f37465 100644 --- a/arch/parisc/include/asm/pdc.h +++ b/arch/parisc/include/asm/pdc.h @@ -46,6 +46,9 @@ int pdc_cache_info(struct pdc_cache_info *cache); int pdc_spaceid_bits(unsigned long *space_bits); #ifndef CONFIG_PA20 int pdc_btlb_info(struct pdc_btlb_info *btlb); +int pdc_btlb_insert(unsigned long long vpage, unsigned long physpage, unsigned long len, + unsigned long entry_info, unsigned long slot); +int pdc_btlb_purge_all(void); int pdc_mem_map_hpa(struct pdc_memory_map *r_addr, struct pdc_module_path *mod_path); #endif /* !CONFIG_PA20 */ int pdc_pim_toc11(struct pdc_toc_pim_11 *ret); diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index 8f37e75f2fb9c..79bf44d04aa85 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -696,18 +696,42 @@ int pdc_spaceid_bits(unsigned long *space_bits) */ int pdc_btlb_info(struct pdc_btlb_info *btlb) { - int retval; + int retval; unsigned long flags; - spin_lock_irqsave(&pdc_lock, flags); - retval = mem_pdc_call(PDC_BLOCK_TLB, PDC_BTLB_INFO, __pa(pdc_result), 0); - memcpy(btlb, pdc_result, sizeof(*btlb)); - spin_unlock_irqrestore(&pdc_lock, flags); + spin_lock_irqsave(&pdc_lock, flags); + retval = mem_pdc_call(PDC_BLOCK_TLB, PDC_BTLB_INFO, __pa(pdc_result), 0); + memcpy(btlb, pdc_result, sizeof(*btlb)); + spin_unlock_irqrestore(&pdc_lock, flags); - if(retval < 0) { - btlb->max_size = 0; - } - return retval; + if(retval < 0) { + btlb->max_size = 0; + } + return retval; +} + +int pdc_btlb_insert(unsigned long long vpage, unsigned long physpage, unsigned long len, + unsigned long entry_info, unsigned long slot) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&pdc_lock, flags); + retval = mem_pdc_call(PDC_BLOCK_TLB, PDC_BTLB_INSERT, (unsigned long) (vpage >> 32), + (unsigned long) vpage, physpage, len, entry_info, slot); + spin_unlock_irqrestore(&pdc_lock, flags); + return retval; +} + +int pdc_btlb_purge_all(void) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&pdc_lock, flags); + retval = mem_pdc_call(PDC_BLOCK_TLB, PDC_BTLB_PURGE_ALL); + spin_unlock_irqrestore(&pdc_lock, flags); + return retval; } /** From 4695e45ec0cd4d422694b69e9f39c742bb1f35fc Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 7 Sep 2023 08:07:55 +0200 Subject: [PATCH 029/161] parisc: BTLB: _edata symbol has to be page aligned for BTLB support Signed-off-by: Helge Deller --- arch/parisc/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 1aaa2ca098003..58694d1989c23 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -154,6 +154,7 @@ SECTIONS } /* End of data section */ + . = ALIGN(PAGE_SIZE); _edata = .; /* BSS */ From 3756597a684da9fbf966f91ed351033ac1a0f55f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 7 Sep 2023 08:51:20 +0200 Subject: [PATCH 030/161] parisc: firmware: Simplify calling non-PA20 functions Instead of usig #ifdefs, simply return PDC_BAD_PROC for functions which aren't available on 64-bit CPUs. Signed-off-by: Helge Deller --- arch/parisc/include/asm/pdc.h | 2 -- arch/parisc/kernel/firmware.c | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/parisc/include/asm/pdc.h b/arch/parisc/include/asm/pdc.h index 2fd7eb2f37465..5d2d9737e579d 100644 --- a/arch/parisc/include/asm/pdc.h +++ b/arch/parisc/include/asm/pdc.h @@ -44,13 +44,11 @@ int pdc_model_capabilities(unsigned long *capabilities); int pdc_model_platform_info(char *orig_prod_num, char *current_prod_num, char *serial_no); int pdc_cache_info(struct pdc_cache_info *cache); int pdc_spaceid_bits(unsigned long *space_bits); -#ifndef CONFIG_PA20 int pdc_btlb_info(struct pdc_btlb_info *btlb); int pdc_btlb_insert(unsigned long long vpage, unsigned long physpage, unsigned long len, unsigned long entry_info, unsigned long slot); int pdc_btlb_purge_all(void); int pdc_mem_map_hpa(struct pdc_memory_map *r_addr, struct pdc_module_path *mod_path); -#endif /* !CONFIG_PA20 */ int pdc_pim_toc11(struct pdc_toc_pim_11 *ret); int pdc_pim_toc20(struct pdc_toc_pim_20 *ret); int pdc_lan_station_id(char *lan_addr, unsigned long net_hpa); diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index 79bf44d04aa85..81078abec521a 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -687,7 +687,6 @@ int pdc_spaceid_bits(unsigned long *space_bits) return retval; } -#ifndef CONFIG_PA20 /** * pdc_btlb_info - Return block TLB information. * @btlb: The return buffer. @@ -699,6 +698,9 @@ int pdc_btlb_info(struct pdc_btlb_info *btlb) int retval; unsigned long flags; + if (IS_ENABLED(CONFIG_PA20)) + return PDC_BAD_PROC; + spin_lock_irqsave(&pdc_lock, flags); retval = mem_pdc_call(PDC_BLOCK_TLB, PDC_BTLB_INFO, __pa(pdc_result), 0); memcpy(btlb, pdc_result, sizeof(*btlb)); @@ -716,6 +718,9 @@ int pdc_btlb_insert(unsigned long long vpage, unsigned long physpage, unsigned l int retval; unsigned long flags; + if (IS_ENABLED(CONFIG_PA20)) + return PDC_BAD_PROC; + spin_lock_irqsave(&pdc_lock, flags); retval = mem_pdc_call(PDC_BLOCK_TLB, PDC_BTLB_INSERT, (unsigned long) (vpage >> 32), (unsigned long) vpage, physpage, len, entry_info, slot); @@ -728,6 +733,9 @@ int pdc_btlb_purge_all(void) int retval; unsigned long flags; + if (IS_ENABLED(CONFIG_PA20)) + return PDC_BAD_PROC; + spin_lock_irqsave(&pdc_lock, flags); retval = mem_pdc_call(PDC_BLOCK_TLB, PDC_BTLB_PURGE_ALL); spin_unlock_irqrestore(&pdc_lock, flags); @@ -752,6 +760,9 @@ int pdc_mem_map_hpa(struct pdc_memory_map *address, int retval; unsigned long flags; + if (IS_ENABLED(CONFIG_PA20)) + return PDC_BAD_PROC; + spin_lock_irqsave(&pdc_lock, flags); memcpy(pdc_result2, mod_path, sizeof(*mod_path)); retval = mem_pdc_call(PDC_MEM_MAP, PDC_MEM_MAP_HPA, __pa(pdc_result), @@ -761,7 +772,6 @@ int pdc_mem_map_hpa(struct pdc_memory_map *address, return retval; } -#endif /* !CONFIG_PA20 */ /** * pdc_lan_station_id - Get the LAN address. From e5ef93d02d6c9cc3a14e7348481c9e41a528caa1 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 7 Sep 2023 08:57:44 +0200 Subject: [PATCH 031/161] parisc: BTLB: Initialize BTLB tables at CPU startup Initialize the BTLB entries when starting up a CPU. Note that BTLBs are not available on 64-bit CPUs. Signed-off-by: Helge Deller --- arch/parisc/include/asm/cache.h | 1 + arch/parisc/include/asm/processor.h | 1 + arch/parisc/kernel/cache.c | 8 +--- arch/parisc/kernel/processor.c | 2 + arch/parisc/mm/init.c | 72 +++++++++++++++++++++++++++++ 5 files changed, 77 insertions(+), 7 deletions(-) diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h index e23d06b51a204..2a60d7a72f1fa 100644 --- a/arch/parisc/include/asm/cache.h +++ b/arch/parisc/include/asm/cache.h @@ -37,6 +37,7 @@ extern int split_tlb; extern int dcache_stride; extern int icache_stride; extern struct pdc_cache_info cache_info; +extern struct pdc_btlb_info btlb_info; void parisc_setup_cache_timing(void); #define pdtlb(sr, addr) asm volatile("pdtlb 0(%%sr%0,%1)" \ diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index d77c43d329743..ff6cbdb6903bc 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -310,6 +310,7 @@ extern void do_syscall_trace_exit(struct pt_regs *); struct seq_file; extern void early_trap_init(void); extern void collect_boot_cpu_data(void); +extern void btlb_init_per_cpu(void); extern int show_cpuinfo (struct seq_file *m, void *v); /* driver code in driver/parisc */ diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 442109a489406..268d90a9325b4 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -58,7 +58,7 @@ int pa_serialize_tlb_flushes __ro_after_init; struct pdc_cache_info cache_info __ro_after_init; #ifndef CONFIG_PA20 -static struct pdc_btlb_info btlb_info __ro_after_init; +struct pdc_btlb_info btlb_info __ro_after_init; #endif DEFINE_STATIC_KEY_TRUE(parisc_has_cache); @@ -264,12 +264,6 @@ parisc_cache_init(void) icache_stride = CAFL_STRIDE(cache_info.ic_conf); #undef CAFL_STRIDE -#ifndef CONFIG_PA20 - if (pdc_btlb_info(&btlb_info) < 0) { - memset(&btlb_info, 0, sizeof btlb_info); - } -#endif - if ((boot_cpu_data.pdc.capabilities & PDC_MODEL_NVA_MASK) == PDC_MODEL_NVA_UNSUPPORTED) { printk(KERN_WARNING "parisc_cache_init: Only equivalent aliasing supported!\n"); diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index a0e2d37c5b3b5..1fc89fa2c2d21 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -368,6 +368,8 @@ int init_per_cpu(int cpunum) /* FUTURE: Enable Performance Monitor : ccr bit 0x20 */ init_percpu_prof(cpunum); + btlb_init_per_cpu(); + return ret; } diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index a088c243edeaa..a2a3e89f2d9ab 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -32,6 +32,7 @@ #include #include #include +#include extern int data_start; extern void parisc_kernel_start(void); /* Kernel entry point in head.S */ @@ -720,6 +721,77 @@ void __init paging_init(void) parisc_bootmem_free(); } +static void alloc_btlb(unsigned long start, unsigned long end, int *slot, + unsigned long entry_info) +{ + const int slot_max = btlb_info.fixed_range_info.num_comb; + int min_num_pages = btlb_info.min_size; + unsigned long size; + + /* map at minimum 4 pages */ + if (min_num_pages < 4) + min_num_pages = 4; + + size = HUGEPAGE_SIZE; + while (start < end && *slot < slot_max && size >= PAGE_SIZE) { + /* starting address must have same alignment as size! */ + /* if correctly aligned and fits in double size, increase */ + if (((start & (2 * size - 1)) == 0) && + (end - start) >= (2 * size)) { + size <<= 1; + continue; + } + /* if current size alignment is too big, try smaller size */ + if ((start & (size - 1)) != 0) { + size >>= 1; + continue; + } + if ((end - start) >= size) { + if ((size >> PAGE_SHIFT) >= min_num_pages) + pdc_btlb_insert(start >> PAGE_SHIFT, __pa(start) >> PAGE_SHIFT, + size >> PAGE_SHIFT, entry_info, *slot); + (*slot)++; + start += size; + continue; + } + size /= 2; + continue; + } +} + +void btlb_init_per_cpu(void) +{ + unsigned long s, t, e; + int slot; + + /* BTLBs are not available on 64-bit CPUs */ + if (IS_ENABLED(CONFIG_PA20)) + return; + else if (pdc_btlb_info(&btlb_info) < 0) { + memset(&btlb_info, 0, sizeof btlb_info); + } + + /* insert BLTLBs for code and data segments */ + s = (uintptr_t) dereference_function_descriptor(&_stext); + e = (uintptr_t) dereference_function_descriptor(&_etext); + t = (uintptr_t) dereference_function_descriptor(&_sdata); + BUG_ON(t != e); + + /* code segments */ + slot = 0; + alloc_btlb(s, e, &slot, 0x13800000); + + /* sanity check */ + t = (uintptr_t) dereference_function_descriptor(&_edata); + e = (uintptr_t) dereference_function_descriptor(&__bss_start); + BUG_ON(t != e); + + /* data segments */ + s = (uintptr_t) dereference_function_descriptor(&_sdata); + e = (uintptr_t) dereference_function_descriptor(&__bss_stop); + alloc_btlb(s, e, &slot, 0x11800000); +} + #ifdef CONFIG_PA20 /* From d20b484c674d2eae816978a98fa38b4054aeca3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 6 Sep 2023 11:50:39 +0200 Subject: [PATCH 032/161] drm/drm_exec: Work around a WW mutex lockdep oddity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If *any* object of a certain WW mutex class is locked, lockdep will consider *all* mutexes of that class as locked. Also the lock allocation tracking code will apparently register only the address of the first mutex of a given class locked in a sequence. This has the odd consequence that if that first mutex is unlocked while other mutexes of the same class remain locked and then its memory then freed, the lock alloc tracking code will incorrectly assume that memory is freed with a held lock in there. For now, work around that for drm_exec by releasing the first grabbed object lock last. v2: - Fix a typo (Danilo Krummrich) - Reword the commit message a bit. - Add a Fixes: tag Related lock alloc tracking warning: [ 322.660067] ========================= [ 322.660070] WARNING: held lock freed! [ 322.660074] 6.5.0-rc7+ #155 Tainted: G U N [ 322.660078] ------------------------- [ 322.660081] kunit_try_catch/4981 is freeing memory ffff888112adc000-ffff888112adc3ff, with a lock still held there! [ 322.660089] ffff888112adc1a0 (reservation_ww_class_mutex){+.+.}-{3:3}, at: drm_exec_lock_obj+0x11a/0x600 [drm_exec] [ 322.660104] 2 locks held by kunit_try_catch/4981: [ 322.660108] #0: ffffc9000343fe18 (reservation_ww_class_acquire){+.+.}-{0:0}, at: test_early_put+0x22f/0x490 [drm_exec_test] [ 322.660123] #1: ffff888112adc1a0 (reservation_ww_class_mutex){+.+.}-{3:3}, at: drm_exec_lock_obj+0x11a/0x600 [drm_exec] [ 322.660135] stack backtrace: [ 322.660139] CPU: 7 PID: 4981 Comm: kunit_try_catch Tainted: G U N 6.5.0-rc7+ #155 [ 322.660146] Hardware name: ASUS System Product Name/PRIME B560M-A AC, BIOS 0403 01/26/2021 [ 322.660152] Call Trace: [ 322.660155] [ 322.660158] dump_stack_lvl+0x57/0x90 [ 322.660164] debug_check_no_locks_freed+0x20b/0x2b0 [ 322.660172] slab_free_freelist_hook+0xa1/0x160 [ 322.660179] ? drm_exec_unlock_all+0x168/0x2a0 [drm_exec] [ 322.660186] __kmem_cache_free+0xb2/0x290 [ 322.660192] drm_exec_unlock_all+0x168/0x2a0 [drm_exec] [ 322.660200] drm_exec_fini+0xf/0x1c0 [drm_exec] [ 322.660206] test_early_put+0x289/0x490 [drm_exec_test] [ 322.660215] ? __pfx_test_early_put+0x10/0x10 [drm_exec_test] [ 322.660222] ? __kasan_check_byte+0xf/0x40 [ 322.660227] ? __ksize+0x63/0x140 [ 322.660233] ? drmm_add_final_kfree+0x3e/0xa0 [drm] [ 322.660289] ? _raw_spin_unlock_irqrestore+0x30/0x60 [ 322.660294] ? lockdep_hardirqs_on+0x7d/0x100 [ 322.660301] ? __pfx_kunit_try_run_case+0x10/0x10 [kunit] [ 322.660310] ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10 [kunit] [ 322.660319] kunit_generic_run_threadfn_adapter+0x4a/0x90 [kunit] [ 322.660328] kthread+0x2e7/0x3c0 [ 322.660334] ? __pfx_kthread+0x10/0x10 [ 322.660339] ret_from_fork+0x2d/0x70 [ 322.660345] ? __pfx_kthread+0x10/0x10 [ 322.660349] ret_from_fork_asm+0x1b/0x30 [ 322.660358] [ 322.660818] ok 8 test_early_put Cc: Christian König Cc: Boris Brezillon Cc: Danilo Krummrich Cc: dri-devel@lists.freedesktop.org Fixes: 09593216bff1 ("drm: execution context for GEM buffers v7") Signed-off-by: Thomas Hellström Reviewed-by: Boris Brezillon Reviewed-by: Danilo Krummrich Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20230906095039.3320-4-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/drm_exec.c | 2 +- include/drm/drm_exec.h | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/drm_exec.c b/drivers/gpu/drm/drm_exec.c index ff69cf0fb42aa..5d2809de4517c 100644 --- a/drivers/gpu/drm/drm_exec.c +++ b/drivers/gpu/drm/drm_exec.c @@ -56,7 +56,7 @@ static void drm_exec_unlock_all(struct drm_exec *exec) struct drm_gem_object *obj; unsigned long index; - drm_exec_for_each_locked_object(exec, index, obj) { + drm_exec_for_each_locked_object_reverse(exec, index, obj) { dma_resv_unlock(obj->resv); drm_gem_object_put(obj); } diff --git a/include/drm/drm_exec.h b/include/drm/drm_exec.h index e0462361adf9e..b5bf0b6da791c 100644 --- a/include/drm/drm_exec.h +++ b/include/drm/drm_exec.h @@ -51,6 +51,20 @@ struct drm_exec { struct drm_gem_object *prelocked; }; +/** + * drm_exec_obj() - Return the object for a give drm_exec index + * @exec: Pointer to the drm_exec context + * @index: The index. + * + * Return: Pointer to the locked object corresponding to @index if + * index is within the number of locked objects. NULL otherwise. + */ +static inline struct drm_gem_object * +drm_exec_obj(struct drm_exec *exec, unsigned long index) +{ + return index < exec->num_objects ? exec->objects[index] : NULL; +} + /** * drm_exec_for_each_locked_object - iterate over all the locked objects * @exec: drm_exec object @@ -59,10 +73,23 @@ struct drm_exec { * * Iterate over all the locked GEM objects inside the drm_exec object. */ -#define drm_exec_for_each_locked_object(exec, index, obj) \ - for (index = 0, obj = (exec)->objects[0]; \ - index < (exec)->num_objects; \ - ++index, obj = (exec)->objects[index]) +#define drm_exec_for_each_locked_object(exec, index, obj) \ + for ((index) = 0; ((obj) = drm_exec_obj(exec, index)); ++(index)) + +/** + * drm_exec_for_each_locked_object_reverse - iterate over all the locked + * objects in reverse locking order + * @exec: drm_exec object + * @index: unsigned long index for the iteration + * @obj: the current GEM object + * + * Iterate over all the locked GEM objects inside the drm_exec object in + * reverse locking order. Note that @index may go below zero and wrap, + * but that will be caught by drm_exec_obj(), returning a NULL object. + */ +#define drm_exec_for_each_locked_object_reverse(exec, index, obj) \ + for ((index) = (exec)->num_objects - 1; \ + ((obj) = drm_exec_obj(exec, index)); --(index)) /** * drm_exec_until_all_locked - loop until all GEM objects are locked From dcbad727513d277144aee482b2ffbcd2255c37aa Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 6 Sep 2023 15:55:17 -0400 Subject: [PATCH 033/161] drm/radeon: make fence wait in suballocator uninterrruptable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 254986e324ad ("drm/radeon: Use the drm suballocation manager implementation.") made the fence wait in amdgpu_sa_bo_new() interruptible but there is no code to handle an interrupt. This caused the kernel to randomly explode in high-VRAM-pressure situations so make it uninterruptible again. Fixes: 254986e324ad ("drm/radeon: Use the drm suballocation manager implementation.") Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2769 Signed-off-by: Alex Deucher CC: stable@vger.kernel.org # 6.4+ CC: Simon Pilkington Link: https://patchwork.freedesktop.org/patch/msgid/20230906195517.1345717-1-alexander.deucher@amd.com Signed-off-by: Christian König --- drivers/gpu/drm/radeon/radeon_sa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_sa.c b/drivers/gpu/drm/radeon/radeon_sa.c index c87a57c9c592b..22dd8b4456855 100644 --- a/drivers/gpu/drm/radeon/radeon_sa.c +++ b/drivers/gpu/drm/radeon/radeon_sa.c @@ -123,7 +123,7 @@ int radeon_sa_bo_new(struct radeon_sa_manager *sa_manager, unsigned int size, unsigned int align) { struct drm_suballoc *sa = drm_suballoc_new(&sa_manager->base, size, - GFP_KERNEL, true, align); + GFP_KERNEL, false, align); if (IS_ERR(sa)) { *sa_bo = NULL; From 9879e5e1c528d1ee9c4473b1d0668ba988bfb6ca Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 6 Sep 2023 22:47:11 -0400 Subject: [PATCH 034/161] tracefs/eventfs: Use dput to free the toplevel events directory Currently when rmdir on an instance is done, eventfs_remove_events_dir() is called and it does a dput on the dentry and then frees the eventfs_inode that represents the events directory. But there's no protection against a reader reading the top level events directory at the same time and we can get a use after free error. Instead, use the dput() associated to the dentry to also free the eventfs_inode associated to the events directory, as that will get called when the last reference to the directory is released. This issue triggered the following KASAN report: ================================================================== BUG: KASAN: slab-use-after-free in eventfs_root_lookup+0x88/0x1b0 Read of size 8 at addr ffff888120130ca0 by task ftracetest/1201 CPU: 4 PID: 1201 Comm: ftracetest Not tainted 6.5.0-test-10737-g469e0a8194e7 #13 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 Call Trace: dump_stack_lvl+0x57/0x90 print_report+0xcf/0x670 ? __pfx_ring_buffer_record_off+0x10/0x10 ? _raw_spin_lock_irqsave+0x2b/0x70 ? __virt_addr_valid+0xd9/0x160 kasan_report+0xd4/0x110 ? eventfs_root_lookup+0x88/0x1b0 ? eventfs_root_lookup+0x88/0x1b0 eventfs_root_lookup+0x88/0x1b0 ? eventfs_root_lookup+0x33/0x1b0 __lookup_slow+0x194/0x2a0 ? __pfx___lookup_slow+0x10/0x10 ? down_read+0x11c/0x330 walk_component+0x166/0x220 link_path_walk.part.0.constprop.0+0x3a3/0x5a0 ? seqcount_lockdep_reader_access+0x82/0x90 ? __pfx_link_path_walk.part.0.constprop.0+0x10/0x10 path_openat+0x143/0x11f0 ? __lock_acquire+0xa1a/0x3220 ? __pfx_path_openat+0x10/0x10 ? __pfx___lock_acquire+0x10/0x10 do_filp_open+0x166/0x290 ? __pfx_do_filp_open+0x10/0x10 ? lock_is_held_type+0xce/0x120 ? preempt_count_sub+0xb7/0x100 ? _raw_spin_unlock+0x29/0x50 ? alloc_fd+0x1a0/0x320 do_sys_openat2+0x126/0x160 ? rcu_is_watching+0x34/0x60 ? __pfx_do_sys_openat2+0x10/0x10 ? __might_resched+0x2cf/0x3b0 ? __fget_light+0xdf/0x100 __x64_sys_openat+0xcd/0x140 ? __pfx___x64_sys_openat+0x10/0x10 ? syscall_enter_from_user_mode+0x22/0x90 ? lockdep_hardirqs_on+0x7d/0x100 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 RIP: 0033:0x7f1dceef5e51 Code: 75 57 89 f0 25 00 00 41 00 3d 00 00 41 00 74 49 80 3d 9a 27 0e 00 00 74 6d 89 da 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 93 00 00 00 48 8b 54 24 28 64 48 2b 14 25 RSP: 002b:00007fff2cddf380 EFLAGS: 00000202 ORIG_RAX: 0000000000000101 RAX: ffffffffffffffda RBX: 0000000000000241 RCX: 00007f1dceef5e51 RDX: 0000000000000241 RSI: 000055d7520677d0 RDI: 00000000ffffff9c RBP: 000055d7520677d0 R08: 000000000000001e R09: 0000000000000001 R10: 00000000000001b6 R11: 0000000000000202 R12: 0000000000000000 R13: 0000000000000003 R14: 000055d752035678 R15: 000055d752067788 Allocated by task 1200: kasan_save_stack+0x2f/0x50 kasan_set_track+0x21/0x30 __kasan_kmalloc+0x8b/0x90 eventfs_create_events_dir+0x54/0x220 create_event_toplevel_files+0x42/0x130 event_trace_add_tracer+0x33/0x180 trace_array_create_dir+0x52/0xf0 trace_array_create+0x361/0x410 instance_mkdir+0x6b/0xb0 tracefs_syscall_mkdir+0x57/0x80 vfs_mkdir+0x275/0x380 do_mkdirat+0x1da/0x210 __x64_sys_mkdir+0x74/0xa0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Freed by task 1251: kasan_save_stack+0x2f/0x50 kasan_set_track+0x21/0x30 kasan_save_free_info+0x27/0x40 __kasan_slab_free+0x106/0x180 __kmem_cache_free+0x149/0x2e0 event_trace_del_tracer+0xcb/0x120 __remove_instance+0x16a/0x340 instance_rmdir+0x77/0xa0 tracefs_syscall_rmdir+0x77/0xc0 vfs_rmdir+0xed/0x2d0 do_rmdir+0x235/0x280 __x64_sys_rmdir+0x5f/0x90 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 The buggy address belongs to the object at ffff888120130ca0 which belongs to the cache kmalloc-16 of size 16 The buggy address is located 0 bytes inside of freed 16-byte region [ffff888120130ca0, ffff888120130cb0) The buggy address belongs to the physical page: page:000000004dbddbb0 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x120130 flags: 0x17ffffc0000800(slab|node=0|zone=2|lastcpupid=0x1fffff) page_type: 0xffffffff() raw: 0017ffffc0000800 ffff8881000423c0 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000800080 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888120130b80: 00 00 fc fc 00 05 fc fc 00 00 fc fc 00 02 fc fc ffff888120130c00: 00 07 fc fc 00 00 fc fc 00 00 fc fc fa fb fc fc >ffff888120130c80: 00 00 fc fc fa fb fc fc 00 00 fc fc 00 00 fc fc ^ ffff888120130d00: 00 00 fc fc 00 00 fc fc 00 00 fc fc fa fb fc fc ffff888120130d80: 00 00 fc fc 00 00 fc fc 00 00 fc fc 00 00 fc fc ================================================================== Link: https://lkml.kernel.org/r/20230907024803.250873643@goodmis.org Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/ Cc: Ajay Kaher Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Fixes: 5bdcd5f5331a2 eventfs: ("Implement removal of meta data from eventfs") Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Reported-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 17 ++++++++++++----- fs/tracefs/inode.c | 2 +- fs/tracefs/internal.h | 5 +++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index fa1a1679a8866..609ccb5b7cfcd 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -185,17 +185,27 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void * /** * eventfs_set_ef_status_free - set the ef->status to free + * @ti: the tracefs_inode of the dentry * @dentry: dentry who's status to be freed * * eventfs_set_ef_status_free will be called if no more * references remain */ -void eventfs_set_ef_status_free(struct dentry *dentry) +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry) { struct tracefs_inode *ti_parent; + struct eventfs_inode *ei; struct eventfs_file *ef; mutex_lock(&eventfs_mutex); + + /* The top level events directory may be freed by this */ + if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) { + ei = ti->private; + kfree(ei); + goto out; + } + ti_parent = get_tracefs(dentry->d_parent->d_inode); if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE)) goto out; @@ -510,7 +520,7 @@ struct dentry *eventfs_create_events_dir(const char *name, INIT_LIST_HEAD(&ei->e_top_files); ti = get_tracefs(inode); - ti->flags |= TRACEFS_EVENT_INODE; + ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE; ti->private = ei; inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; @@ -806,7 +816,6 @@ void eventfs_remove(struct eventfs_file *ef) void eventfs_remove_events_dir(struct dentry *dentry) { struct tracefs_inode *ti; - struct eventfs_inode *ei; if (!dentry || !dentry->d_inode) return; @@ -815,8 +824,6 @@ void eventfs_remove_events_dir(struct dentry *dentry) if (!ti || !(ti->flags & TRACEFS_EVENT_INODE)) return; - ei = ti->private; d_invalidate(dentry); dput(dentry); - kfree(ei); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 3b8dd938b1c8f..891653ba9cf35 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode) ti = get_tracefs(inode); if (ti && ti->flags & TRACEFS_EVENT_INODE) - eventfs_set_ef_status_free(dentry); + eventfs_set_ef_status_free(ti, dentry); iput(inode); } diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 69c2b1d87c464..4f2e49e2197b1 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -3,7 +3,8 @@ #define _TRACEFS_INTERNAL_H enum { - TRACEFS_EVENT_INODE = BIT(1), + TRACEFS_EVENT_INODE = BIT(1), + TRACEFS_EVENT_TOP_INODE = BIT(2), }; struct tracefs_inode { @@ -24,6 +25,6 @@ struct inode *tracefs_get_inode(struct super_block *sb); struct dentry *eventfs_start_creating(const char *name, struct dentry *parent); struct dentry *eventfs_failed_creating(struct dentry *dentry); struct dentry *eventfs_end_creating(struct dentry *dentry); -void eventfs_set_ef_status_free(struct dentry *dentry); +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry); #endif /* _TRACEFS_INTERNAL_H */ From f5ca233e2e66dc1c249bf07eefa37e34a6c9346a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 6 Sep 2023 22:47:12 -0400 Subject: [PATCH 035/161] tracing: Increase trace array ref count on enable and filter files When the trace event enable and filter files are opened, increment the trace array ref counter, otherwise they can be accessed when the trace array is being deleted. The ref counter keeps the trace array from being deleted while those files are opened. Link: https://lkml.kernel.org/r/20230907024803.456187066@goodmis.org Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Fixes: 8530dec63e7b4 ("tracing: Add tracing_check_open_get_tr()") Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Reported-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 27 +++++++++++++++++++++++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 6 ++++-- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 35783a7baf151..0827037ee3b81 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4973,6 +4973,33 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) return 0; } +/* + * The private pointer of the inode is the trace_event_file. + * Update the tr ref count associated to it. + */ +int tracing_open_file_tr(struct inode *inode, struct file *filp) +{ + struct trace_event_file *file = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(file->tr); + if (ret) + return ret; + + filp->private_data = inode->i_private; + + return 0; +} + +int tracing_release_file_tr(struct inode *inode, struct file *filp) +{ + struct trace_event_file *file = inode->i_private; + + trace_array_put(file->tr); + + return 0; +} + static int tracing_mark_open(struct inode *inode, struct file *filp) { stream_open(inode, filp); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5669dd1f90d9e..77debe53f07cf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -610,6 +610,8 @@ void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); +int tracing_open_file_tr(struct inode *inode, struct file *filp); +int tracing_release_file_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ed367d713be07..2af92177b7658 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2103,9 +2103,10 @@ static const struct file_operations ftrace_set_event_notrace_pid_fops = { }; static const struct file_operations ftrace_enable_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = event_enable_read, .write = event_enable_write, + .release = tracing_release_file_tr, .llseek = default_llseek, }; @@ -2122,9 +2123,10 @@ static const struct file_operations ftrace_event_id_fops = { }; static const struct file_operations ftrace_event_filter_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = event_filter_read, .write = event_filter_write, + .release = tracing_release_file_tr, .llseek = default_llseek, }; From 7d660c9b2bc95107f90a9f4c4759be85309a6550 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 6 Sep 2023 22:47:13 -0400 Subject: [PATCH 036/161] tracing: Have tracing_max_latency inc the trace array ref count The tracing_max_latency file points to the trace_array max_latency field. For an instance, if the file is opened and the instance is deleted, reading or writing to the file will cause a use after free. Up the ref count of the trace_array when tracing_max_latency is opened. Link: https://lkml.kernel.org/r/20230907024803.666889383@goodmis.org Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Zheng Yejian Fixes: 8530dec63e7b4 ("tracing: Add tracing_check_open_get_tr()") Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0827037ee3b81..c8b8b4c6feafa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1772,7 +1772,7 @@ static void trace_create_maxlat_file(struct trace_array *tr, init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); tr->d_max_latency = trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, - d_tracer, &tr->max_latency, + d_tracer, tr, &tracing_max_lat_fops); } @@ -1805,7 +1805,7 @@ void latency_fsnotify(struct trace_array *tr) #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ - d_tracer, &tr->max_latency, &tracing_max_lat_fops) + d_tracer, tr, &tracing_max_lat_fops) #endif @@ -6717,14 +6717,18 @@ static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos); + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); } static ssize_t tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); } #endif @@ -7778,10 +7782,11 @@ static const struct file_operations tracing_thresh_fops = { #ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_max_lat_read, .write = tracing_max_lat_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; #endif From 9b37febc578b2e1ad76a105aab11d00af5ec3d27 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 6 Sep 2023 22:47:14 -0400 Subject: [PATCH 037/161] tracing: Have current_trace inc the trace array ref count The current_trace updates the trace array tracer. For an instance, if the file is opened and the instance is deleted, reading or writing to the file will cause a use after free. Up the ref count of the trace array when current_trace is opened. Link: https://lkml.kernel.org/r/20230907024803.877687227@goodmis.org Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Zheng Yejian Fixes: 8530dec63e7b4 ("tracing: Add tracing_check_open_get_tr()") Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c8b8b4c6feafa..b82df33d20ffd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7791,10 +7791,11 @@ static const struct file_operations tracing_max_lat_fops = { #endif static const struct file_operations set_tracer_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_set_trace_read, .write = tracing_set_trace_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations tracing_pipe_fops = { From 7e2cfbd2d3c86afcd5c26b5c4b1dd251f63c5838 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 6 Sep 2023 22:47:15 -0400 Subject: [PATCH 038/161] tracing: Have option files inc the trace array ref count The option files update the options for a given trace array. For an instance, if the file is opened and the instance is deleted, reading or writing to the file will cause a use after free. Up the ref count of the trace_array when an option file is opened. Link: https://lkml.kernel.org/r/20230907024804.086679464@goodmis.org Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Zheng Yejian Fixes: 8530dec63e7b4 ("tracing: Add tracing_check_open_get_tr()") Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b82df33d20ffd..0608ad20cf30e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8988,12 +8988,33 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static int tracing_open_options(struct inode *inode, struct file *filp) +{ + struct trace_option_dentry *topt = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(topt->tr); + if (ret) + return ret; + + filp->private_data = inode->i_private; + return 0; +} + +static int tracing_release_options(struct inode *inode, struct file *file) +{ + struct trace_option_dentry *topt = file->private_data; + + trace_array_put(topt->tr); + return 0; +} static const struct file_operations trace_options_fops = { - .open = tracing_open_generic, + .open = tracing_open_options, .read = trace_options_read, .write = trace_options_write, .llseek = generic_file_llseek, + .release = tracing_release_options, }; /* From e5c624f027ac74f97e97c8f36c69228ac9f1102d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 6 Sep 2023 22:47:16 -0400 Subject: [PATCH 039/161] tracing: Have event inject files inc the trace array ref count The event inject files add events for a specific trace array. For an instance, if the file is opened and the instance is deleted, reading or writing to the file will cause a use after free. Up the ref count of the trace_array when a event inject file is opened. Link: https://lkml.kernel.org/r/20230907024804.292337868@goodmis.org Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Zheng Yejian Fixes: 6c3edaf9fd6a ("tracing: Introduce trace event injection") Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_inject.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index abe805d471eb8..8650562bdaa98 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -328,7 +328,8 @@ event_inject_read(struct file *file, char __user *buf, size_t size, } const struct file_operations event_inject_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = event_inject_read, .write = event_inject_write, + .release = tracing_release_file_tr, }; From f6bd2c92488c30ef53b5bd80c52f0a7eee9d545a Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 6 Sep 2023 16:19:30 +0800 Subject: [PATCH 040/161] ring-buffer: Avoid softlockup in ring_buffer_resize() When user resize all trace ring buffer through file 'buffer_size_kb', then in ring_buffer_resize(), kernel allocates buffer pages for each cpu in a loop. If the kernel preemption model is PREEMPT_NONE and there are many cpus and there are many buffer pages to be allocated, it may not give up cpu for a long time and finally cause a softlockup. To avoid it, call cond_resched() after each cpu buffer allocation. Link: https://lore.kernel.org/linux-trace-kernel/20230906081930.3939106-1-zhengyejian1@huawei.com Cc: Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 78502d4c7214e..72ccf75defd0f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2198,6 +2198,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, err = -ENOMEM; goto out_err; } + + cond_resched(); } cpus_read_lock(); From ac28b1ec6135649b5d78b028e47264cb3ebca5ea Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Thu, 7 Sep 2023 10:57:09 +0800 Subject: [PATCH 041/161] net: ipv4: fix one memleak in __inet_del_ifa() I got the below warning when do fuzzing test: unregister_netdevice: waiting for bond0 to become free. Usage count = 2 It can be repoduced via: ip link add bond0 type bond sysctl -w net.ipv4.conf.bond0.promote_secondaries=1 ip addr add 4.117.174.103/0 scope 0x40 dev bond0 ip addr add 192.168.100.111/255.255.255.254 scope 0 dev bond0 ip addr add 0.0.0.4/0 scope 0x40 secondary dev bond0 ip addr del 4.117.174.103/0 scope 0x40 dev bond0 ip link delete bond0 type bond In this reproduction test case, an incorrect 'last_prim' is found in __inet_del_ifa(), as a result, the secondary address(0.0.0.4/0 scope 0x40) is lost. The memory of the secondary address is leaked and the reference of in_device and net_device is leaked. Fix this problem: Look for 'last_prim' starting at location of the deleted IP and inserting the promoted IP into the location of 'last_prim'. Fixes: 0ff60a45678e ("[IPV4]: Fix secondary IP addresses after promotion") Signed-off-by: Liu Jian Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 9cf64ee47dd2b..ca0ff15dc8fa3 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -355,14 +355,14 @@ static void __inet_del_ifa(struct in_device *in_dev, { struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa, *ifa1; - struct in_ifaddr *last_prim; + struct in_ifaddr __rcu **last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); ifa1 = rtnl_dereference(*ifap); - last_prim = rtnl_dereference(in_dev->ifa_list); + last_prim = ifap; if (in_dev->dead) goto no_promotions; @@ -376,7 +376,7 @@ static void __inet_del_ifa(struct in_device *in_dev, while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) - last_prim = ifa; + last_prim = &ifa->ifa_next; if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || @@ -440,9 +440,9 @@ static void __inet_del_ifa(struct in_device *in_dev, rcu_assign_pointer(prev_prom->ifa_next, next_sec); - last_sec = rtnl_dereference(last_prim->ifa_next); + last_sec = rtnl_dereference(*last_prim); rcu_assign_pointer(promote->ifa_next, last_sec); - rcu_assign_pointer(last_prim->ifa_next, promote); + rcu_assign_pointer(*last_prim, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; From 2d6cd791e63ec0c68ae95ecd55dc6c50ac7829cf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Sep 2023 12:10:31 +0100 Subject: [PATCH 042/161] btrfs: fix race between finishing block group creation and its item update Commit 675dfe1223a6 ("btrfs: fix block group item corruption after inserting new block group") fixed one race that resulted in not persisting a block group's item when its "used" bytes field decreases to zero. However there's another race that can happen in a much shorter time window that results in the same problem. The following sequence of steps explains how it can happen: 1) Task A creates a metadata block group X, its "used" and "commit_used" fields are initialized to 0; 2) Two extents are allocated from block group X, so its "used" field is updated to 32K, and its "commit_used" field remains as 0; 3) Transaction commit starts, by some task B, and it enters btrfs_start_dirty_block_groups(). There it tries to update the block group item for block group X, which currently has its "used" field with a value of 32K and its "commit_used" field with a value of 0. However that fails since the block group item was not yet inserted, so at update_block_group_item(), the btrfs_search_slot() call returns 1, and then we set 'ret' to -ENOENT. Before jumping to the label 'fail'... 4) The block group item is inserted by task A, when for example btrfs_create_pending_block_groups() is called when releasing its transaction handle. This results in insert_block_group_item() inserting the block group item in the extent tree (or block group tree), with a "used" field having a value of 32K and setting "commit_used", in struct btrfs_block_group, to the same value (32K); 5) Task B jumps to the 'fail' label and then resets the "commit_used" field to 0. At btrfs_start_dirty_block_groups(), because -ENOENT was returned from update_block_group_item(), we add the block group again to the list of dirty block groups, so that we will try again in the critical section of the transaction commit when calling btrfs_write_dirty_block_groups(); 6) Later the two extents from block group X are freed, so its "used" field becomes 0; 7) If no more extents are allocated from block group X before we get into btrfs_write_dirty_block_groups(), then when we call update_block_group_item() again for block group X, we will not update the block group item to reflect that it has 0 bytes used, because the "used" and "commit_used" fields in struct btrfs_block_group have the same value, a value of 0. As a result after committing the transaction we have an empty block group with its block group item having a 32K value for its "used" field. This will trigger errors from fsck ("btrfs check" command) and after mounting again the fs, the cleaner kthread will not automatically delete the empty block group, since its "used" field is not 0. Possibly there are other issues due to this inconsistency. When this issue happens, the error reported by fsck is like this: [1/7] checking root items [2/7] checking extents block group [1104150528 1073741824] used 39796736 but extent items used 0 ERROR: errors found in extent allocation tree or chunk allocation (...) So fix this by not resetting the "commit_used" field of a block group when we don't find the block group item at update_block_group_item(). Fixes: 7248e0cebbef ("btrfs: skip update of block group item if used bytes are the same") CC: stable@vger.kernel.org # 6.2+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 0cb1dee965a02..b2e5107b7cecc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3028,8 +3028,16 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); fail: btrfs_release_path(path); - /* We didn't update the block group item, need to revert @commit_used. */ - if (ret < 0) { + /* + * We didn't update the block group item, need to revert commit_used + * unless the block group item didn't exist yet - this is to prevent a + * race with a concurrent insertion of the block group item, with + * insert_block_group_item(), that happened just after we attempted to + * update. In that case we would reset commit_used to 0 just after the + * insertion set it to a value greater than 0 - if the block group later + * becomes with 0 used bytes, we would incorrectly skip its update. + */ + if (ret < 0 && ret != -ENOENT) { spin_lock(&cache->lock); cache->commit_used = old_commit_used; spin_unlock(&cache->lock); From ee34a82e890a7babb5585daf1a6dd7d4d1cf142a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 26 Aug 2023 11:28:20 +0100 Subject: [PATCH 043/161] btrfs: release path before inode lookup during the ino lookup ioctl During the ino lookup ioctl we can end up calling btrfs_iget() to get an inode reference while we are holding on a root's btree. If btrfs_iget() needs to lookup the inode from the root's btree, because it's not currently loaded in memory, then it will need to lock another or the same path in the same root btree. This may result in a deadlock and trigger the following lockdep splat: WARNING: possible circular locking dependency detected 6.5.0-rc7-syzkaller-00004-gf7757129e3de #0 Not tainted ------------------------------------------------------ syz-executor277/5012 is trying to acquire lock: ffff88802df41710 (btrfs-tree-01){++++}-{3:3}, at: __btrfs_tree_read_lock+0x2f/0x220 fs/btrfs/locking.c:136 but task is already holding lock: ffff88802df418e8 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x2f/0x220 fs/btrfs/locking.c:136 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (btrfs-tree-00){++++}-{3:3}: down_read_nested+0x49/0x2f0 kernel/locking/rwsem.c:1645 __btrfs_tree_read_lock+0x2f/0x220 fs/btrfs/locking.c:136 btrfs_search_slot+0x13a4/0x2f80 fs/btrfs/ctree.c:2302 btrfs_init_root_free_objectid+0x148/0x320 fs/btrfs/disk-io.c:4955 btrfs_init_fs_root fs/btrfs/disk-io.c:1128 [inline] btrfs_get_root_ref+0x5ae/0xae0 fs/btrfs/disk-io.c:1338 btrfs_get_fs_root fs/btrfs/disk-io.c:1390 [inline] open_ctree+0x29c8/0x3030 fs/btrfs/disk-io.c:3494 btrfs_fill_super+0x1c7/0x2f0 fs/btrfs/super.c:1154 btrfs_mount_root+0x7e0/0x910 fs/btrfs/super.c:1519 legacy_get_tree+0xef/0x190 fs/fs_context.c:611 vfs_get_tree+0x8c/0x270 fs/super.c:1519 fc_mount fs/namespace.c:1112 [inline] vfs_kern_mount+0xbc/0x150 fs/namespace.c:1142 btrfs_mount+0x39f/0xb50 fs/btrfs/super.c:1579 legacy_get_tree+0xef/0x190 fs/fs_context.c:611 vfs_get_tree+0x8c/0x270 fs/super.c:1519 do_new_mount+0x28f/0xae0 fs/namespace.c:3335 do_mount fs/namespace.c:3675 [inline] __do_sys_mount fs/namespace.c:3884 [inline] __se_sys_mount+0x2d9/0x3c0 fs/namespace.c:3861 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #0 (btrfs-tree-01){++++}-{3:3}: check_prev_add kernel/locking/lockdep.c:3142 [inline] check_prevs_add kernel/locking/lockdep.c:3261 [inline] validate_chain kernel/locking/lockdep.c:3876 [inline] __lock_acquire+0x39ff/0x7f70 kernel/locking/lockdep.c:5144 lock_acquire+0x1e3/0x520 kernel/locking/lockdep.c:5761 down_read_nested+0x49/0x2f0 kernel/locking/rwsem.c:1645 __btrfs_tree_read_lock+0x2f/0x220 fs/btrfs/locking.c:136 btrfs_tree_read_lock fs/btrfs/locking.c:142 [inline] btrfs_read_lock_root_node+0x292/0x3c0 fs/btrfs/locking.c:281 btrfs_search_slot_get_root fs/btrfs/ctree.c:1832 [inline] btrfs_search_slot+0x4ff/0x2f80 fs/btrfs/ctree.c:2154 btrfs_lookup_inode+0xdc/0x480 fs/btrfs/inode-item.c:412 btrfs_read_locked_inode fs/btrfs/inode.c:3892 [inline] btrfs_iget_path+0x2d9/0x1520 fs/btrfs/inode.c:5716 btrfs_search_path_in_tree_user fs/btrfs/ioctl.c:1961 [inline] btrfs_ioctl_ino_lookup_user+0x77a/0xf50 fs/btrfs/ioctl.c:2105 btrfs_ioctl+0xb0b/0xd40 fs/btrfs/ioctl.c:4683 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl+0xf8/0x170 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- rlock(btrfs-tree-00); lock(btrfs-tree-01); lock(btrfs-tree-00); rlock(btrfs-tree-01); *** DEADLOCK *** 1 lock held by syz-executor277/5012: #0: ffff88802df418e8 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x2f/0x220 fs/btrfs/locking.c:136 stack backtrace: CPU: 1 PID: 5012 Comm: syz-executor277 Not tainted 6.5.0-rc7-syzkaller-00004-gf7757129e3de #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2d0 lib/dump_stack.c:106 check_noncircular+0x375/0x4a0 kernel/locking/lockdep.c:2195 check_prev_add kernel/locking/lockdep.c:3142 [inline] check_prevs_add kernel/locking/lockdep.c:3261 [inline] validate_chain kernel/locking/lockdep.c:3876 [inline] __lock_acquire+0x39ff/0x7f70 kernel/locking/lockdep.c:5144 lock_acquire+0x1e3/0x520 kernel/locking/lockdep.c:5761 down_read_nested+0x49/0x2f0 kernel/locking/rwsem.c:1645 __btrfs_tree_read_lock+0x2f/0x220 fs/btrfs/locking.c:136 btrfs_tree_read_lock fs/btrfs/locking.c:142 [inline] btrfs_read_lock_root_node+0x292/0x3c0 fs/btrfs/locking.c:281 btrfs_search_slot_get_root fs/btrfs/ctree.c:1832 [inline] btrfs_search_slot+0x4ff/0x2f80 fs/btrfs/ctree.c:2154 btrfs_lookup_inode+0xdc/0x480 fs/btrfs/inode-item.c:412 btrfs_read_locked_inode fs/btrfs/inode.c:3892 [inline] btrfs_iget_path+0x2d9/0x1520 fs/btrfs/inode.c:5716 btrfs_search_path_in_tree_user fs/btrfs/ioctl.c:1961 [inline] btrfs_ioctl_ino_lookup_user+0x77a/0xf50 fs/btrfs/ioctl.c:2105 btrfs_ioctl+0xb0b/0xd40 fs/btrfs/ioctl.c:4683 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl+0xf8/0x170 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f0bec94ea39 Fix this simply by releasing the path before calling btrfs_iget() as at point we don't need the path anymore. Reported-by: syzbot+bf66ad948981797d2f1d@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/00000000000045fa140603c4a969@google.com/ Fixes: 23d0b79dfaed ("btrfs: Add unprivileged version of ino_lookup ioctl") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a895d105464b6..d27b0d86b8e2c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1958,6 +1958,13 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } + /* + * We don't need the path anymore, so release it and + * avoid deadlocks and lockdep warnings in case + * btrfs_iget() needs to lookup the inode from its root + * btree and lock the same leaf. + */ + btrfs_release_path(path); temp_inode = btrfs_iget(sb, key2.objectid, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); @@ -1978,7 +1985,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } - btrfs_release_path(path); key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; From 77d20c685b6baeb942606a93ed861c191381b73e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Aug 2023 16:59:22 -0400 Subject: [PATCH 044/161] btrfs: do not block starts waiting on previous transaction commit Internally I got a report of very long stalls on normal operations like creating a new file when auto relocation was running. The reporter used the 'bpf offcputime' tracer to show that we would get stuck in start_transaction for 5 to 30 seconds, and were always being woken up by the transaction commit. Using my timing-everything script, which times how long a function takes and what percentage of that total time is taken up by its children, I saw several traces like this 1083 took 32812902424 ns 29929002926 ns 91.2110% wait_for_commit_duration 25568 ns 7.7920e-05% commit_fs_roots_duration 1007751 ns 0.00307% commit_cowonly_roots_duration 446855602 ns 1.36182% btrfs_run_delayed_refs_duration 271980 ns 0.00082% btrfs_run_delayed_items_duration 2008 ns 6.1195e-06% btrfs_apply_pending_changes_duration 9656 ns 2.9427e-05% switch_commit_roots_duration 1598 ns 4.8700e-06% btrfs_commit_device_sizes_duration 4314 ns 1.3147e-05% btrfs_free_log_root_tree_duration Here I was only tracing functions that happen where we are between START_COMMIT and UNBLOCKED in order to see what would be keeping us blocked for so long. The wait_for_commit() we do is where we wait for a previous transaction that hasn't completed it's commit. This can include all of the unpin work and other cleanups, which tends to be the longest part of our transaction commit. There is no reason we should be blocking new things from entering the transaction at this point, it just adds to random latency spikes for no reason. Fix this by adding a PREP stage. This allows us to properly deal with multiple committers coming in at the same time, we retain the behavior that the winner waits on the previous transaction and the losers all wait for this transaction commit to occur. Nothing else is blocked during the PREP stage, and then once the wait is complete we switch to COMMIT_START and all of the same behavior as before is maintained. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/locking.h | 2 +- fs/btrfs/transaction.c | 39 ++++++++++++++++++++++++--------------- fs/btrfs/transaction.h | 1 + 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0a96ea8c1d3a6..639f1e308e4c6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1547,7 +1547,7 @@ static int transaction_kthread(void *arg) delta = ktime_get_seconds() - cur->start_time; if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && - cur->state < TRANS_STATE_COMMIT_START && + cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay -= msecs_to_jiffies((delta - 1) * 1000); @@ -2682,8 +2682,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered); btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent); - btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start, - BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked, BTRFS_LOCKDEP_TRANS_UNBLOCKED); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed, @@ -4870,7 +4870,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) while (!list_empty(&fs_info->trans_list)) { t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); - if (t->state >= TRANS_STATE_COMMIT_START) { + if (t->state >= TRANS_STATE_COMMIT_PREP) { refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index edb9b4a0dba15..7d6ee1e609bf2 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -79,7 +79,7 @@ enum btrfs_lock_nesting { }; enum btrfs_lockdep_trans_states { - BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP, BTRFS_LOCKDEP_TRANS_UNBLOCKED, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, BTRFS_LOCKDEP_TRANS_COMPLETED, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ab09542f21708..341363beaf108 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -56,12 +56,17 @@ static struct kmem_cache *btrfs_trans_handle_cachep; * | Call btrfs_commit_transaction() on any trans handle attached to * | transaction N * V - * Transaction N [[TRANS_STATE_COMMIT_START]] + * Transaction N [[TRANS_STATE_COMMIT_PREP]] + * | + * | If there are simultaneous calls to btrfs_commit_transaction() one will win + * | the race and the rest will wait for the winner to commit the transaction. + * | + * | The winner will wait for previous running transaction to completely finish + * | if there is one. * | - * | Will wait for previous running transaction to completely finish if there - * | is one + * Transaction N [[TRANS_STATE_COMMIT_START]] * | - * | Then one of the following happes: + * | Then one of the following happens: * | - Wait for all other trans handle holders to release. * | The btrfs_commit_transaction() caller will do the commit work. * | - Wait for current transaction to be committed by others. @@ -112,6 +117,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep; */ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, + [TRANS_STATE_COMMIT_PREP] = 0U, [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH | @@ -1983,7 +1989,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) * Wait for the current transaction commit to start and block * subsequent transaction joins */ - btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); @@ -2130,7 +2136,7 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) return; lockdep_assert_held(&trans->fs_info->trans_lock); - ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } @@ -2154,7 +2160,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); - btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); @@ -2214,7 +2220,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } spin_lock(&fs_info->trans_lock); - if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; add_pending_snapshot(trans); @@ -2226,7 +2232,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) want_state = TRANS_STATE_SUPER_COMMITTED; btrfs_trans_state_lockdep_release(fs_info, - BTRFS_LOCKDEP_TRANS_COMMIT_START); + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans, want_state); @@ -2238,9 +2244,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } - cur_trans->state = TRANS_STATE_COMMIT_START; + cur_trans->state = TRANS_STATE_COMMIT_PREP; wake_up(&fs_info->transaction_blocked_wait); - btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); if (cur_trans->list.prev != &fs_info->trans_list) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; @@ -2261,11 +2267,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_put_transaction(prev_trans); if (ret) goto lockdep_release; - } else { - spin_unlock(&fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); } } else { - spin_unlock(&fs_info->trans_lock); /* * The previous transaction was aborted and was already removed * from the list of transactions at fs_info->trans_list. So we @@ -2273,11 +2277,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * corrupt state (pointing to trees with unwritten nodes/leafs). */ if (BTRFS_FS_ERROR(fs_info)) { + spin_unlock(&fs_info->trans_lock); ret = -EROFS; goto lockdep_release; } } + cur_trans->state = TRANS_STATE_COMMIT_START; + wake_up(&fs_info->transaction_blocked_wait); + spin_unlock(&fs_info->trans_lock); + /* * Get the time spent on the work done by the commit thread and not * the time spent waiting on a previous commit @@ -2587,7 +2596,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto cleanup_transaction; lockdep_trans_commit_start_release: - btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_end_transaction(trans); return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 8e9fa23bd7fed..6b309f8a99a86 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -14,6 +14,7 @@ enum btrfs_trans_state { TRANS_STATE_RUNNING, + TRANS_STATE_COMMIT_PREP, TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, From e110f8911ddb93e6f55da14ccbbe705397b30d0b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 29 Aug 2023 11:34:52 +0100 Subject: [PATCH 045/161] btrfs: fix lockdep splat and potential deadlock after failure running delayed items When running delayed items we are holding a delayed node's mutex and then we will attempt to modify a subvolume btree to insert/update/delete the delayed items. However if have an error during the insertions for example, btrfs_insert_delayed_items() may return with a path that has locked extent buffers (a leaf at the very least), and then we attempt to release the delayed node at __btrfs_run_delayed_items(), which requires taking the delayed node's mutex, causing an ABBA type of deadlock. This was reported by syzbot and the lockdep splat is the following: WARNING: possible circular locking dependency detected 6.5.0-rc7-syzkaller-00024-g93f5de5f648d #0 Not tainted ------------------------------------------------------ syz-executor.2/13257 is trying to acquire lock: ffff88801835c0c0 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node+0x9a/0xaa0 fs/btrfs/delayed-inode.c:256 but task is already holding lock: ffff88802a5ab8e8 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_lock+0x3c/0x2a0 fs/btrfs/locking.c:198 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (btrfs-tree-00){++++}-{3:3}: __lock_release kernel/locking/lockdep.c:5475 [inline] lock_release+0x36f/0x9d0 kernel/locking/lockdep.c:5781 up_write+0x79/0x580 kernel/locking/rwsem.c:1625 btrfs_tree_unlock_rw fs/btrfs/locking.h:189 [inline] btrfs_unlock_up_safe+0x179/0x3b0 fs/btrfs/locking.c:239 search_leaf fs/btrfs/ctree.c:1986 [inline] btrfs_search_slot+0x2511/0x2f80 fs/btrfs/ctree.c:2230 btrfs_insert_empty_items+0x9c/0x180 fs/btrfs/ctree.c:4376 btrfs_insert_delayed_item fs/btrfs/delayed-inode.c:746 [inline] btrfs_insert_delayed_items fs/btrfs/delayed-inode.c:824 [inline] __btrfs_commit_inode_delayed_items+0xd24/0x2410 fs/btrfs/delayed-inode.c:1111 __btrfs_run_delayed_items+0x1db/0x430 fs/btrfs/delayed-inode.c:1153 flush_space+0x269/0xe70 fs/btrfs/space-info.c:723 btrfs_async_reclaim_metadata_space+0x106/0x350 fs/btrfs/space-info.c:1078 process_one_work+0x92c/0x12c0 kernel/workqueue.c:2600 worker_thread+0xa63/0x1210 kernel/workqueue.c:2751 kthread+0x2b8/0x350 kernel/kthread.c:389 ret_from_fork+0x2e/0x60 arch/x86/kernel/process.c:145 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: check_prev_add kernel/locking/lockdep.c:3142 [inline] check_prevs_add kernel/locking/lockdep.c:3261 [inline] validate_chain kernel/locking/lockdep.c:3876 [inline] __lock_acquire+0x39ff/0x7f70 kernel/locking/lockdep.c:5144 lock_acquire+0x1e3/0x520 kernel/locking/lockdep.c:5761 __mutex_lock_common+0x1d8/0x2530 kernel/locking/mutex.c:603 __mutex_lock kernel/locking/mutex.c:747 [inline] mutex_lock_nested+0x1b/0x20 kernel/locking/mutex.c:799 __btrfs_release_delayed_node+0x9a/0xaa0 fs/btrfs/delayed-inode.c:256 btrfs_release_delayed_node fs/btrfs/delayed-inode.c:281 [inline] __btrfs_run_delayed_items+0x2b5/0x430 fs/btrfs/delayed-inode.c:1156 btrfs_commit_transaction+0x859/0x2ff0 fs/btrfs/transaction.c:2276 btrfs_sync_file+0xf56/0x1330 fs/btrfs/file.c:1988 vfs_fsync_range fs/sync.c:188 [inline] vfs_fsync fs/sync.c:202 [inline] do_fsync fs/sync.c:212 [inline] __do_sys_fsync fs/sync.c:220 [inline] __se_sys_fsync fs/sync.c:218 [inline] __x64_sys_fsync+0x196/0x1e0 fs/sync.c:218 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-tree-00); lock(&delayed_node->mutex); lock(btrfs-tree-00); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by syz-executor.2/13257: #0: ffff88802c1ee370 (btrfs_trans_num_writers){++++}-{0:0}, at: spin_unlock include/linux/spinlock.h:391 [inline] #0: ffff88802c1ee370 (btrfs_trans_num_writers){++++}-{0:0}, at: join_transaction+0xb87/0xe00 fs/btrfs/transaction.c:287 #1: ffff88802c1ee398 (btrfs_trans_num_extwriters){++++}-{0:0}, at: join_transaction+0xbb2/0xe00 fs/btrfs/transaction.c:288 #2: ffff88802a5ab8e8 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_lock+0x3c/0x2a0 fs/btrfs/locking.c:198 stack backtrace: CPU: 0 PID: 13257 Comm: syz-executor.2 Not tainted 6.5.0-rc7-syzkaller-00024-g93f5de5f648d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2d0 lib/dump_stack.c:106 check_noncircular+0x375/0x4a0 kernel/locking/lockdep.c:2195 check_prev_add kernel/locking/lockdep.c:3142 [inline] check_prevs_add kernel/locking/lockdep.c:3261 [inline] validate_chain kernel/locking/lockdep.c:3876 [inline] __lock_acquire+0x39ff/0x7f70 kernel/locking/lockdep.c:5144 lock_acquire+0x1e3/0x520 kernel/locking/lockdep.c:5761 __mutex_lock_common+0x1d8/0x2530 kernel/locking/mutex.c:603 __mutex_lock kernel/locking/mutex.c:747 [inline] mutex_lock_nested+0x1b/0x20 kernel/locking/mutex.c:799 __btrfs_release_delayed_node+0x9a/0xaa0 fs/btrfs/delayed-inode.c:256 btrfs_release_delayed_node fs/btrfs/delayed-inode.c:281 [inline] __btrfs_run_delayed_items+0x2b5/0x430 fs/btrfs/delayed-inode.c:1156 btrfs_commit_transaction+0x859/0x2ff0 fs/btrfs/transaction.c:2276 btrfs_sync_file+0xf56/0x1330 fs/btrfs/file.c:1988 vfs_fsync_range fs/sync.c:188 [inline] vfs_fsync fs/sync.c:202 [inline] do_fsync fs/sync.c:212 [inline] __do_sys_fsync fs/sync.c:220 [inline] __se_sys_fsync fs/sync.c:218 [inline] __x64_sys_fsync+0x196/0x1e0 fs/sync.c:218 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f3ad047cae9 Code: 28 00 00 00 75 (...) RSP: 002b:00007f3ad12510c8 EFLAGS: 00000246 ORIG_RAX: 000000000000004a RAX: ffffffffffffffda RBX: 00007f3ad059bf80 RCX: 00007f3ad047cae9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000005 RBP: 00007f3ad04c847a R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007f3ad059bf80 R15: 00007ffe56af92f8 ------------[ cut here ]------------ Fix this by releasing the path before releasing the delayed node in the error path at __btrfs_run_delayed_items(). Reported-by: syzbot+a379155f07c134ea9879@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/000000000000abba27060403b5bd@google.com/ CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 85dcf00241375..9300e09d339b9 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1153,20 +1153,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); if (ret) { - btrfs_release_delayed_node(curr_node); - curr_node = NULL; btrfs_abort_transaction(trans, ret); break; } prev_node = curr_node; curr_node = btrfs_next_delayed_node(curr_node); + /* + * See the comment below about releasing path before releasing + * node. If the commit of delayed items was successful the path + * should always be released, but in case of an error, it may + * point to locked extent buffers (a leaf at the very least). + */ + ASSERT(path->nodes[0] == NULL); btrfs_release_delayed_node(prev_node); } + /* + * Release the path to avoid a potential deadlock and lockdep splat when + * releasing the delayed node, as that requires taking the delayed node's + * mutex. If another task starts running delayed items before we take + * the mutex, it will first lock the mutex and then it may try to lock + * the same btree path (leaf). + */ + btrfs_free_path(path); + if (curr_node) btrfs_release_delayed_node(curr_node); - btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; From 4ca8e03cf2bfaeef7c85939fa1ea0c749cd116ab Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Aug 2023 16:59:04 -0400 Subject: [PATCH 046/161] btrfs: check for BTRFS_FS_ERROR in pending ordered assert If we do fast tree logging we increment a counter on the current transaction for every ordered extent we need to wait for. This means we expect the transaction to still be there when we clear pending on the ordered extent. However if we happen to abort the transaction and clean it up, there could be no running transaction, and thus we'll trip the "ASSERT(trans)" check. This is obviously incorrect, and the code properly deals with the case that the transaction doesn't exist. Fix this ASSERT() to only fire if there's no trans and we don't have BTRFS_FS_ERROR() set on the file system. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b46ab348e8e5b..345c449d588cc 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -639,7 +639,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); - ASSERT(trans); + ASSERT(trans || BTRFS_FS_ERROR(fs_info)); if (trans) { if (atomic_dec_and_test(&trans->pending_ordered)) wake_up(&trans->pending_wait); From 5e0e879926c1ce7e1f5e0dfaacaf2d105f7d8a05 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 22 Aug 2023 13:50:51 +0800 Subject: [PATCH 047/161] btrfs: fix a compilation error if DEBUG is defined in btree_dirty_folio [BUG] After commit 72a69cd03082 ("btrfs: subpage: pack all subpage bitmaps into a larger bitmap"), the DEBUG section of btree_dirty_folio() would no longer compile. [CAUSE] If DEBUG is defined, we would do extra checks for btree_dirty_folio(), mostly to make sure the range we marked dirty has an extent buffer and that extent buffer is dirty. For subpage, we need to iterate through all the extent buffers covered by that page range, and make sure they all matches the criteria. However commit 72a69cd03082 ("btrfs: subpage: pack all subpage bitmaps into a larger bitmap") changes how we store the bitmap, we pack all the 16 bits bitmaps into a larger bitmap, which would save some space. This means we no longer have btrfs_subpage::dirty_bitmap, instead the dirty bitmap is starting at btrfs_subpage_info::dirty_offset, and has a length of btrfs_subpage_info::bitmap_nr_bits. [FIX] Although I'm not sure if it still makes sense to maintain such code, at least let it compile. This patch would let us test the bits one by one through the bitmaps. CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 639f1e308e4c6..68f60d50e1fd0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -520,6 +520,7 @@ static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; @@ -533,18 +534,19 @@ static bool btree_dirty_folio(struct address_space *mapping, btrfs_assert_tree_write_locked(eb); return filemap_dirty_folio(mapping, folio); } + + ASSERT(spi); subpage = folio_get_private(folio); - ASSERT(subpage->dirty_bitmap); - while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { + for (cur_bit = spi->dirty_offset; + cur_bit < spi->dirty_offset + spi->bitmap_nr_bits; + cur_bit++) { unsigned long flags; u64 cur; - u16 tmp = (1 << cur_bit); spin_lock_irqsave(&subpage->lock, flags); - if (!(tmp & subpage->dirty_bitmap)) { + if (!test_bit(cur_bit, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - cur_bit++; continue; } spin_unlock_irqrestore(&subpage->lock, flags); @@ -557,7 +559,7 @@ static bool btree_dirty_folio(struct address_space *mapping, btrfs_assert_tree_write_locked(eb); free_extent_buffer(eb); - cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); + cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1; } return filemap_dirty_folio(mapping, folio); } From 91bfe3104b8db0310f76f2dcb6aacef24c889366 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 28 Aug 2023 09:06:42 +0100 Subject: [PATCH 048/161] btrfs: improve error message after failure to add delayed dir index item If we fail to add a delayed dir index item because there's already another item with the same index number, we print an error message (and then BUG). However that message isn't very helpful to debug anything because we don't know what's the index number and what are the values of index counters in the inode and its delayed inode (index_cnt fields of struct btrfs_inode and struct btrfs_delayed_node). So update the error message to include the index number and counters. We actually had a recent case where this issue was hit by a syzbot report (see the link below). Link: https://lore.kernel.org/linux-btrfs/00000000000036e1290603e097e0@google.com/ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 9300e09d339b9..bb9908cbabfed 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1511,9 +1511,10 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, ret = __btrfs_add_delayed_item(delayed_node, delayed_item); if (unlikely(ret)) { btrfs_err(trans->fs_info, - "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - name_len, name, delayed_node->root->root_key.objectid, - delayed_node->inode_id, ret); +"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d", + name_len, name, index, btrfs_root_id(delayed_node->root), + delayed_node->inode_id, dir->index_cnt, + delayed_node->index_cnt, ret); BUG(); } mutex_unlock(&delayed_node->mutex); From 2c58c3931ede7cd08cbecf1f1a4acaf0a04a41a9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 28 Aug 2023 09:06:43 +0100 Subject: [PATCH 049/161] btrfs: remove BUG() after failure to insert delayed dir index item Instead of calling BUG() when we fail to insert a delayed dir index item into the delayed node's tree, we can just release all the resources we have allocated/acquired before and return the error to the caller. This is fine because all existing call chains undo anything they have done before calling btrfs_insert_delayed_dir_index() or BUG_ON (when creating pending snapshots in the transaction commit path). So remove the BUG() call and do proper error handling. This relates to a syzbot report linked below, but does not fix it because it only prevents hitting a BUG(), it does not fix the issue where somehow we attempt to use twice the same index number for different index items. Link: https://lore.kernel.org/linux-btrfs/00000000000036e1290603e097e0@google.com/ CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 74 +++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bb9908cbabfed..6e779bc163400 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1426,7 +1426,29 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); } -/* Will return 0 or -ENOMEM */ +static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + /* + * Adding the new dir index item does not require touching another + * leaf, so we can release 1 unit of metadata that was previously + * reserved when starting the transaction. This applies only to + * the case where we had a transaction start and excludes the + * transaction join case (when replaying log trees). + */ + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, bytes, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); + ASSERT(trans->bytes_reserved >= bytes); + trans->bytes_reserved -= bytes; +} + +/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, @@ -1468,6 +1490,27 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); + /* + * First attempt to insert the delayed item. This is to make the error + * handling path simpler in case we fail (-EEXIST). There's no risk of + * any other task coming in and running the delayed item before we do + * the metadata space reservation below, because we are holding the + * delayed node's mutex and that mutex must also be locked before the + * node's delayed items can be run. + */ + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); + if (unlikely(ret)) { + btrfs_err(trans->fs_info, +"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d", + name_len, name, index, btrfs_root_id(delayed_node->root), + delayed_node->inode_id, dir->index_cnt, + delayed_node->index_cnt, ret); + btrfs_release_delayed_item(delayed_item); + btrfs_release_dir_index_item_space(trans); + mutex_unlock(&delayed_node->mutex); + goto release_node; + } + if (delayed_node->index_item_leaves == 0 || delayed_node->curr_index_batch_size + data_len > leaf_data_size) { delayed_node->curr_index_batch_size = data_len; @@ -1485,37 +1528,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, * impossible. */ if (WARN_ON(ret)) { - mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_item(delayed_item); + mutex_unlock(&delayed_node->mutex); goto release_node; } delayed_node->index_item_leaves++; - } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { - const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); - - /* - * Adding the new dir index item does not require touching another - * leaf, so we can release 1 unit of metadata that was previously - * reserved when starting the transaction. This applies only to - * the case where we had a transaction start and excludes the - * transaction join case (when replaying log trees). - */ - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, bytes, 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); - ASSERT(trans->bytes_reserved >= bytes); - trans->bytes_reserved -= bytes; - } - - ret = __btrfs_add_delayed_item(delayed_node, delayed_item); - if (unlikely(ret)) { - btrfs_err(trans->fs_info, -"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d", - name_len, name, index, btrfs_root_id(delayed_node->root), - delayed_node->inode_id, dir->index_cnt, - delayed_node->index_cnt, ret); - BUG(); + } else { + btrfs_release_dir_index_item_space(trans); } mutex_unlock(&delayed_node->mutex); From a57c2d4e46f519b24558ae0752c17eec416ac72a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 28 Aug 2023 09:06:44 +0100 Subject: [PATCH 050/161] btrfs: assert delayed node locked when removing delayed item When removing a delayed item, or releasing which will remove it as well, we will modify one of the delayed node's rbtrees and item counter if the delayed item is in one of the rbtrees. This require having the delayed node's mutex locked, otherwise we will race with other tasks modifying the rbtrees and the counter. This is motivated by a previous version of another patch actually calling btrfs_release_delayed_item() after unlocking the delayed node's mutex and against a delayed item that is in a rbtree. So assert at __btrfs_remove_delayed_item() that the delayed node's mutex is locked. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6e779bc163400..dfe29b29915fc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -412,6 +412,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root) static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { + struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node; struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; @@ -419,18 +420,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) if (RB_EMPTY_NODE(&delayed_item->rb_node)) return; - delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; + /* If it's in a rbtree, then we need to have delayed node locked. */ + lockdep_assert_held(&delayed_node->mutex); + + delayed_root = delayed_node->root->fs_info->delayed_root; BUG_ON(!delayed_root); if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) - root = &delayed_item->delayed_node->ins_root; + root = &delayed_node->ins_root; else - root = &delayed_item->delayed_node->del_root; + root = &delayed_node->del_root; rb_erase_cached(&delayed_item->rb_node, root); RB_CLEAR_NODE(&delayed_item->rb_node); - delayed_item->delayed_node->count--; + delayed_node->count--; finish_one_item(delayed_root); } From 5facccc9402301d67d48bef06159b91f7e41efc0 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Wed, 23 Aug 2023 03:17:47 +0530 Subject: [PATCH 051/161] MAINTAINERS: remove links to obsolete btrfs.wiki.kernel.org The wiki has been archived and is not updated anymore. Remove or replace the links in files that contain it (MAINTAINERS, Kconfig, docs). Signed-off-by: Bhaskar Chowdhury Reviewed-by: David Sterba Signed-off-by: David Sterba --- Documentation/filesystems/btrfs.rst | 1 - MAINTAINERS | 1 - fs/btrfs/Kconfig | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Documentation/filesystems/btrfs.rst b/Documentation/filesystems/btrfs.rst index 992eddb0e11b8..a81db8f54d689 100644 --- a/Documentation/filesystems/btrfs.rst +++ b/Documentation/filesystems/btrfs.rst @@ -37,7 +37,6 @@ For more information please refer to the documentation site or wiki https://btrfs.readthedocs.io - https://btrfs.wiki.kernel.org that maintains information about administration tasks, frequently asked questions, use cases, mount options, comprehensible changelogs, features, diff --git a/MAINTAINERS b/MAINTAINERS index d590ce31aa726..dea8c26efbca1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4360,7 +4360,6 @@ M: David Sterba L: linux-btrfs@vger.kernel.org S: Maintained W: https://btrfs.readthedocs.io -W: https://btrfs.wiki.kernel.org/ Q: https://patchwork.kernel.org/project/linux-btrfs/list/ C: irc://irc.libera.chat/btrfs T: git git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 3282adc84d521..a25c9910d90b2 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -31,7 +31,7 @@ config BTRFS_FS continue to be mountable and usable by newer kernels. For more information, please see the web pages at - http://btrfs.wiki.kernel.org. + https://btrfs.readthedocs.io To compile this file system support as a module, choose M here. The module will be called btrfs. From 9616cb34b08ec86642b162eae75c5a7ca8debe3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 5 Jul 2023 13:53:17 +0200 Subject: [PATCH 052/161] kselftest/runner.sh: Propagate SIGTERM to runner child MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Timeouts in kselftest are done using the "timeout" command with the "--foreground" option. Without the "foreground" option, it is not possible for a user to cancel the runner using SIGINT, because the signal is not propagated to timeout which is running in a different process group. The "forground" options places the timeout in the same process group as its parent, but only sends the SIGTERM (on timeout) signal to the forked process. Unfortunately, this does not play nice with all kselftests, e.g. "net:fcnal-test.sh", where the child processes will linger because timeout does not send SIGTERM to the group. Some users have noted these hangs [1]. Fix this by nesting the timeout with an additional timeout without the foreground option. Link: https://lore.kernel.org/all/7650b2eb-0aee-a2b0-2e64-c9bc63210f67@alu.unizg.hr/ # [1] Fixes: 651e0d881461 ("kselftest/runner: allow to properly deliver signals to tests") Signed-off-by: Björn Töpel Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest/runner.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh index 1c952d1401d46..70e0a465e30da 100644 --- a/tools/testing/selftests/kselftest/runner.sh +++ b/tools/testing/selftests/kselftest/runner.sh @@ -36,7 +36,8 @@ tap_timeout() { # Make sure tests will time out if utility is available. if [ -x /usr/bin/timeout ] ; then - /usr/bin/timeout --foreground "$kselftest_timeout" $1 + /usr/bin/timeout --foreground "$kselftest_timeout" \ + /usr/bin/timeout "$kselftest_timeout" $1 else $1 fi From 5f9dd2e896a91bfca90f8463eb6808c03d535d8a Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 22 Aug 2023 18:09:40 -0300 Subject: [PATCH 053/161] selftests: fix dependency checker script This patch fixes inconsistencies in the parsing rules of the levels 1 and 2 of the kselftest_deps.sh. It was added the levels 4 and 5 to account for a few edge cases that are present in some tests, also some minor identation styling have been fixed (s/ /\t/g). Signed-off-by: Ricardo B. Marliere Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_deps.sh | 77 +++++++++++++++++++---- 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh index 4bc14d9e8ff1d..de59cc8f03c3f 100755 --- a/tools/testing/selftests/kselftest_deps.sh +++ b/tools/testing/selftests/kselftest_deps.sh @@ -46,11 +46,11 @@ fi print_targets=0 while getopts "p" arg; do - case $arg in - p) + case $arg in + p) print_targets=1 shift;; - esac + esac done if [ $# -eq 0 ] @@ -92,6 +92,10 @@ pass_cnt=0 # Get all TARGETS from selftests Makefile targets=$(grep -E "^TARGETS +|^TARGETS =" Makefile | cut -d "=" -f2) +# Initially, in LDLIBS related lines, the dep checker needs +# to ignore lines containing the following strings: +filter="\$(VAR_LDLIBS)\|pkg-config\|PKG_CONFIG\|IOURING_EXTRA_LIBS" + # Single test case if [ $# -eq 2 ] then @@ -100,6 +104,8 @@ then l1_test $test l2_test $test l3_test $test + l4_test $test + l5_test $test print_results $1 $2 exit $? @@ -113,7 +119,7 @@ fi # Append space at the end of the list to append more tests. l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \ - grep -v "VAR_LDLIBS" | awk -F: '{print $1}') + grep -v "$filter" | awk -F: '{print $1}' | uniq) # Level 2: LDLIBS set dynamically. # @@ -126,7 +132,7 @@ l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \ # Append space at the end of the list to append more tests. l2_tests=$(grep -r --include=Makefile ": LDLIBS" | \ - grep -v "VAR_LDLIBS" | awk -F: '{print $1}') + grep -v "$filter" | awk -F: '{print $1}' | uniq) # Level 3 # memfd and others use pkg-config to find mount and fuse libs @@ -138,11 +144,32 @@ l2_tests=$(grep -r --include=Makefile ": LDLIBS" | \ # VAR_LDLIBS := $(shell pkg-config fuse --libs 2>/dev/null) l3_tests=$(grep -r --include=Makefile "^VAR_LDLIBS" | \ - grep -v "pkg-config" | awk -F: '{print $1}') + grep -v "pkg-config\|PKG_CONFIG" | awk -F: '{print $1}' | uniq) -#echo $l1_tests -#echo $l2_1_tests -#echo $l3_tests +# Level 4 +# some tests may fall back to default using `|| echo -l` +# if pkg-config doesn't find the libs, instead of using VAR_LDLIBS +# as per level 3 checks. +# e.g: +# netfilter/Makefile +# LDLIBS += $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) +l4_tests=$(grep -r --include=Makefile "^LDLIBS" | \ + grep "pkg-config\|PKG_CONFIG" | awk -F: '{print $1}' | uniq) + +# Level 5 +# some tests may use IOURING_EXTRA_LIBS to add extra libs to LDLIBS, +# which in turn may be defined in a sub-Makefile +# e.g.: +# mm/Makefile +# $(OUTPUT)/gup_longterm: LDLIBS += $(IOURING_EXTRA_LIBS) +l5_tests=$(grep -r --include=Makefile "LDLIBS +=.*\$(IOURING_EXTRA_LIBS)" | \ + awk -F: '{print $1}' | uniq) + +#echo l1_tests $l1_tests +#echo l2_tests $l2_tests +#echo l3_tests $l3_tests +#echo l4_tests $l4_tests +#echo l5_tests $l5_tests all_tests print_results $1 $2 @@ -164,24 +191,32 @@ all_tests() for test in $l3_tests; do l3_test $test done + + for test in $l4_tests; do + l4_test $test + done + + for test in $l5_tests; do + l5_test $test + done } # Use same parsing used for l1_tests and pick libraries this time. l1_test() { test_libs=$(grep --include=Makefile "^LDLIBS" $test | \ - grep -v "VAR_LDLIBS" | \ + grep -v "$filter" | \ sed -e 's/\:/ /' | \ sed -e 's/+/ /' | cut -d "=" -f 2) check_libs $test $test_libs } -# Use same parsing used for l2__tests and pick libraries this time. +# Use same parsing used for l2_tests and pick libraries this time. l2_test() { test_libs=$(grep --include=Makefile ": LDLIBS" $test | \ - grep -v "VAR_LDLIBS" | \ + grep -v "$filter" | \ sed -e 's/\:/ /' | sed -e 's/+/ /' | \ cut -d "=" -f 2) @@ -197,6 +232,24 @@ l3_test() check_libs $test $test_libs } +l4_test() +{ + test_libs=$(grep --include=Makefile "^VAR_LDLIBS\|^LDLIBS" $test | \ + grep "\(pkg-config\|PKG_CONFIG\).*|| echo " | \ + sed -e 's/.*|| echo //' | sed -e 's/)$//') + + check_libs $test $test_libs +} + +l5_test() +{ + tests=$(find $(dirname "$test") -type f -name "*.mk") + test_libs=$(grep "^IOURING_EXTRA_LIBS +\?=" $tests | \ + cut -d "=" -f 2) + + check_libs $test $test_libs +} + check_libs() { From 3f3f384139ed147c71e1d770accf610133d5309b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 22 Aug 2023 15:58:37 +0200 Subject: [PATCH 054/161] selftests: Keep symlinks, when possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When kselftest is built/installed with the 'gen_tar' target, rsync is used for the installation step to copy files. Extra care is needed for tests that have symlinks. Commit ae108c48b5d2 ("selftests: net: Fix cross-tree inclusion of scripts") added '-L' (transform symlink into referent file/dir) to rsync, to fix dangling links. However, that broke some tests where the symlink (being a symlink) is part of the test (e.g. exec:execveat). Use rsync's '--copy-unsafe-links' that does right thing. Fixes: ae108c48b5d2 ("selftests: net: Fix cross-tree inclusion of scripts") Signed-off-by: Björn Töpel Reviewed-by: Benjamin Poirier Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index d17854285f2b6..118e0964bda94 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -106,7 +106,7 @@ endef run_tests: all ifdef building_out_of_srctree @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then \ - rsync -aLq $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(OUTPUT); \ + rsync -aq --copy-unsafe-links $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(OUTPUT); \ fi @if [ "X$(TEST_PROGS)" != "X" ]; then \ $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) \ @@ -120,7 +120,7 @@ endif define INSTALL_SINGLE_RULE $(if $(INSTALL_LIST),@mkdir -p $(INSTALL_PATH)) - $(if $(INSTALL_LIST),rsync -aL $(INSTALL_LIST) $(INSTALL_PATH)/) + $(if $(INSTALL_LIST),rsync -a --copy-unsafe-links $(INSTALL_LIST) $(INSTALL_PATH)/) endef define INSTALL_RULE From 7deac114be5fb25a4e865212ed0feaf5f85f2a28 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 25 Aug 2023 10:55:31 +0800 Subject: [PATCH 055/161] md: don't dereference mddev after export_rdev() Except for initial reference, mddev->kobject is referenced by rdev->kobject, and if the last rdev is freed, there is no guarantee that mddev is still valid. Hence mddev should not be used anymore after export_rdev(). This problem can be triggered by following test for mdadm at very low rate: New file: mdadm/tests/23rdev-lifetime devname=${dev0##*/} devt=`cat /sys/block/$devname/dev` pid="" runtime=2 clean_up_test() { pill -9 $pid echo clear > /sys/block/md0/md/array_state } trap 'clean_up_test' EXIT add_by_sysfs() { while true; do echo $devt > /sys/block/md0/md/new_dev done } remove_by_sysfs(){ while true; do echo remove > /sys/block/md0/md/dev-${devname}/state done } echo md0 > /sys/module/md_mod/parameters/new_array || die "create md0 failed" add_by_sysfs & pid="$pid $!" remove_by_sysfs & pid="$pid $!" sleep $runtime exit 0 Test cmd: ./test --save-logs --logdir=/tmp/ --keep-going --dev=loop --tests=23rdev-lifetime Test result: general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6bcb: 0000 [#4] PREEMPT SMP CPU: 0 PID: 1292 Comm: test Tainted: G D W 6.5.0-rc2-00121-g01e55c376936 #562 RIP: 0010:md_wakeup_thread+0x9e/0x320 [md_mod] Call Trace: mddev_unlock+0x1b6/0x310 [md_mod] rdev_attr_store+0xec/0x190 [md_mod] sysfs_kf_write+0x52/0x70 kernfs_fop_write_iter+0x19a/0x2a0 vfs_write+0x3b5/0x770 ksys_write+0x74/0x150 __x64_sys_write+0x22/0x30 do_syscall_64+0x40/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fix this problem by don't dereference mddev after export_rdev(). Fixes: 3ce94ce5d05a ("md: fix duplicate filename for rdev") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230825025532.1523008-2-yukuai1@huaweicloud.com --- drivers/md/md.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0fe7ab6e8ab9f..590aee057acad 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -798,14 +798,14 @@ void mddev_unlock(struct mddev *mddev) } else mutex_unlock(&mddev->reconfig_mutex); + md_wakeup_thread(mddev->thread); + wake_up(&mddev->sb_wait); + list_for_each_entry_safe(rdev, tmp, &delete, same_set) { list_del_init(&rdev->same_set); kobject_del(&rdev->kobj); export_rdev(rdev, mddev); } - - md_wakeup_thread(mddev->thread); - wake_up(&mddev->sb_wait); } EXPORT_SYMBOL_GPL(mddev_unlock); From 99892147f028d711f9d40fefad4f33632593864c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 25 Aug 2023 10:55:32 +0800 Subject: [PATCH 056/161] md: fix warning for holder mismatch from export_rdev() Commit a1d767191096 ("md: use mddev->external to select holder in export_rdev()") fix the problem that 'claim_rdev' is used for blkdev_get_by_dev() while 'rdev' is used for blkdev_put(). However, if mddev->external is changed from 0 to 1, then 'rdev' is used for blkdev_get_by_dev() while 'claim_rdev' is used for blkdev_put(). And this problem can be reporduced reliably by following: New file: mdadm/tests/23rdev-lifetime devname=${dev0##*/} devt=`cat /sys/block/$devname/dev` pid="" runtime=2 clean_up_test() { pill -9 $pid echo clear > /sys/block/md0/md/array_state } trap 'clean_up_test' EXIT add_by_sysfs() { while true; do echo $devt > /sys/block/md0/md/new_dev done } remove_by_sysfs(){ while true; do echo remove > /sys/block/md0/md/dev-${devname}/state done } echo md0 > /sys/module/md_mod/parameters/new_array || die "create md0 failed" add_by_sysfs & pid="$pid $!" remove_by_sysfs & pid="$pid $!" sleep $runtime exit 0 Test cmd: ./test --save-logs --logdir=/tmp/ --keep-going --dev=loop --tests=23rdev-lifetime Test result: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 960 at block/bdev.c:618 blkdev_put+0x27c/0x330 Modules linked in: multipath md_mod loop CPU: 0 PID: 960 Comm: test Not tainted 6.5.0-rc2-00121-g01e55c376936-dirty #50 RIP: 0010:blkdev_put+0x27c/0x330 Call Trace: export_rdev.isra.23+0x50/0xa0 [md_mod] mddev_unlock+0x19d/0x300 [md_mod] rdev_attr_store+0xec/0x190 [md_mod] sysfs_kf_write+0x52/0x70 kernfs_fop_write_iter+0x19a/0x2a0 vfs_write+0x3b5/0x770 ksys_write+0x74/0x150 __x64_sys_write+0x22/0x30 do_syscall_64+0x40/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fix the problem by recording if 'rdev' is used as holder. Fixes: a1d767191096 ("md: use mddev->external to select holder in export_rdev()") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230825025532.1523008-3-yukuai1@huaweicloud.com --- drivers/md/md.c | 15 ++++++++++++--- drivers/md/md.h | 3 +++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 590aee057acad..73758b7541275 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2452,7 +2452,8 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif - blkdev_put(rdev->bdev, mddev->external ? &claim_rdev : rdev); + blkdev_put(rdev->bdev, + test_bit(Holder, &rdev->flags) ? rdev : &claim_rdev); rdev->bdev = NULL; kobject_put(&rdev->kobj); } @@ -3632,6 +3633,7 @@ EXPORT_SYMBOL_GPL(md_rdev_init); static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { struct md_rdev *rdev; + struct md_rdev *holder; sector_t size; int err; @@ -3646,8 +3648,15 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe if (err) goto out_clear_rdev; + if (super_format == -2) { + holder = &claim_rdev; + } else { + holder = rdev; + set_bit(Holder, &rdev->flags); + } + rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, - super_format == -2 ? &claim_rdev : rdev, NULL); + holder, NULL); if (IS_ERR(rdev->bdev)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", MAJOR(newdev), MINOR(newdev)); @@ -3684,7 +3693,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe return rdev; out_blkdev_put: - blkdev_put(rdev->bdev, super_format == -2 ? &claim_rdev : rdev); + blkdev_put(rdev->bdev, holder); out_clear_rdev: md_rdev_clear(rdev); out_free_rdev: diff --git a/drivers/md/md.h b/drivers/md/md.h index 9bcb77bca9639..7c9c13abd7cac 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -211,6 +211,9 @@ enum flag_bits { * check if there is collision between raid1 * serial bios. */ + Holder, /* rdev is used as holder while opening + * underlying disk exclusively. + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, From d508ee2dd5574b1e84d140b927c25807c9b66d7e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 7 Sep 2023 17:58:59 -0400 Subject: [PATCH 057/161] tracefs/eventfs: Free top level files on removal When an instance is removed, the top level files of the eventfs directory are not cleaned up. Call the eventfs_remove() on each of the entries to free them. This was found via kmemleak: unreferenced object 0xffff8881047c1280 (size 96): comm "mkdir", pid 924, jiffies 4294906489 (age 2013.077s) hex dump (first 32 bytes): 18 31 ed 03 81 88 ff ff 00 31 09 24 81 88 ff ff .1.......1.$.... 00 00 00 00 00 00 00 00 98 19 7c 04 81 88 ff ff ..........|..... backtrace: [<000000000fa46b4d>] kmalloc_trace+0x2a/0xa0 [<00000000e729cd0c>] eventfs_prepare_ef.constprop.0+0x3a/0x160 [<000000009032e6a8>] eventfs_add_events_file+0xa0/0x160 [<00000000fe968442>] create_event_toplevel_files+0x6f/0x130 [<00000000e364d173>] event_trace_add_tracer+0x14/0x140 [<00000000411840fa>] trace_array_create_dir+0x52/0xf0 [<00000000967804fa>] trace_array_create+0x208/0x370 [<00000000da505565>] instance_mkdir+0x6b/0xb0 [<00000000dc1215af>] tracefs_syscall_mkdir+0x5b/0x90 [<00000000a8aca289>] vfs_mkdir+0x272/0x380 [<000000007709b242>] do_mkdirat+0xfc/0x1d0 [<00000000c0b6d219>] __x64_sys_mkdir+0x78/0xa0 [<0000000097b5dd4b>] do_syscall_64+0x3f/0x90 [<00000000a3f00cfa>] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 unreferenced object 0xffff888103ed3118 (size 8): comm "mkdir", pid 924, jiffies 4294906489 (age 2013.077s) hex dump (first 8 bytes): 65 6e 61 62 6c 65 00 00 enable.. backtrace: [<0000000010f75127>] __kmalloc_node_track_caller+0x51/0x160 [<000000004b3eca91>] kstrdup+0x34/0x60 [<0000000050074d7a>] eventfs_prepare_ef.constprop.0+0x53/0x160 [<000000009032e6a8>] eventfs_add_events_file+0xa0/0x160 [<00000000fe968442>] create_event_toplevel_files+0x6f/0x130 [<00000000e364d173>] event_trace_add_tracer+0x14/0x140 [<00000000411840fa>] trace_array_create_dir+0x52/0xf0 [<00000000967804fa>] trace_array_create+0x208/0x370 [<00000000da505565>] instance_mkdir+0x6b/0xb0 [<00000000dc1215af>] tracefs_syscall_mkdir+0x5b/0x90 [<00000000a8aca289>] vfs_mkdir+0x272/0x380 [<000000007709b242>] do_mkdirat+0xfc/0x1d0 [<00000000c0b6d219>] __x64_sys_mkdir+0x78/0xa0 [<0000000097b5dd4b>] do_syscall_64+0x3f/0x90 [<00000000a3f00cfa>] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Link: https://lore.kernel.org/linux-trace-kernel/20230907175859.6fedbaa2@gandalf.local.home Cc: Mark Rutland Cc: Ajay Kaher Cc: Zheng Yejian Cc: Naresh Kamboju Reviewed-by: Masami Hiramatsu (Google) Fixes: 5bdcd5f5331a2 eventfs: ("Implement removal of meta data from eventfs") Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 609ccb5b7cfcd..f168aca454587 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -195,17 +195,39 @@ void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry) { struct tracefs_inode *ti_parent; struct eventfs_inode *ei; - struct eventfs_file *ef; - - mutex_lock(&eventfs_mutex); + struct eventfs_file *ef, *tmp; /* The top level events directory may be freed by this */ if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) { + LIST_HEAD(ef_del_list); + + mutex_lock(&eventfs_mutex); + ei = ti->private; + + /* Record all the top level files */ + list_for_each_entry_srcu(ef, &ei->e_top_files, list, + lockdep_is_held(&eventfs_mutex)) { + list_add_tail(&ef->del_list, &ef_del_list); + } + + /* Nothing should access this, but just in case! */ + ti->private = NULL; + + mutex_unlock(&eventfs_mutex); + + /* Now safely free the top level files and their children */ + list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) { + list_del(&ef->del_list); + eventfs_remove(ef); + } + kfree(ei); - goto out; + return; } + mutex_lock(&eventfs_mutex); + ti_parent = get_tracefs(dentry->d_parent->d_inode); if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE)) goto out; From 95a404bd60af6c4d9d8db01ad14fe8957ece31ca Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 7 Sep 2023 12:28:20 -0400 Subject: [PATCH 058/161] ring-buffer: Do not attempt to read past "commit" When iterating over the ring buffer while the ring buffer is active, the writer can corrupt the reader. There's barriers to help detect this and handle it, but that code missed the case where the last event was at the very end of the page and has only 4 bytes left. The checks to detect the corruption by the writer to reads needs to see the length of the event. If the length in the first 4 bytes is zero then the length is stored in the second 4 bytes. But if the writer is in the process of updating that code, there's a small window where the length in the first 4 bytes could be zero even though the length is only 4 bytes. That will cause rb_event_length() to read the next 4 bytes which could happen to be off the allocated page. To protect against this, fail immediately if the next event pointer is less than 8 bytes from the end of the commit (last byte of data), as all events must be a minimum of 8 bytes anyway. Link: https://lore.kernel.org/all/20230905141245.26470-1-Tze-nan.Wu@mediatek.com/ Link: https://lore.kernel.org/linux-trace-kernel/20230907122820.0899019c@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Reported-by: Tze-nan Wu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 72ccf75defd0f..a1651edc48d51 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2390,6 +2390,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) */ commit = rb_page_commit(iter_head_page); smp_rmb(); + + /* An event needs to be at least 8 bytes in size */ + if (iter->head > commit - 8) + goto reset; + event = __rb_page_index(iter_head_page, iter->head); length = rb_event_length(event); From 1ef26d8b2ca5d8715563c951083cf6c385c77d1f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 7 Sep 2023 22:19:11 -0400 Subject: [PATCH 059/161] tracing: Use the new eventfs descriptor for print trigger The check to create the print event "trigger" was using the obsolete "dir" value of the trace_event_file to determine if it should create the trigger or not. But that value will now be NULL because it uses the event file descriptor. Change it to test the "ef" field of the trace_event_file structure so that the trace_marker "trigger" file appears again. Link: https://lkml.kernel.org/r/20230908022001.371815239@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ajay Kaher Fixes: 27152bceea1df ("eventfs: Move tracing/events to eventfs") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0608ad20cf30e..122c23c9eb280 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9792,8 +9792,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) tr, &tracing_mark_fops); file = __find_event_file(tr, "ftrace", "print"); - if (file && file->dir) - trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + if (file && file->ef) + eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, file, &event_trigger_fops); tr->trace_marker_file = file; From 6fdac58c560e4d164eb8161987bee045147cabe4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 7 Sep 2023 22:19:12 -0400 Subject: [PATCH 060/161] tracing: Remove unused trace_event_file dir field Now that eventfs structure is used to create the events directory via the eventfs dynamically allocate code, the "dir" field of the trace_event_file structure is no longer used. Remove it. Link: https://lkml.kernel.org/r/20230908022001.580400115@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ajay Kaher Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 1 - kernel/trace/trace_events.c | 13 ------------- 2 files changed, 14 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index eb5c3add939b7..12f875e9e69a7 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -650,7 +650,6 @@ struct trace_event_file { struct trace_event_call *event_call; struct event_filter __rcu *filter; struct eventfs_file *ef; - struct dentry *dir; struct trace_array *tr; struct trace_subsystem_dir *system; struct list_head triggers; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2af92177b7658..065c639918582 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -992,19 +992,6 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) static void remove_event_file_dir(struct trace_event_file *file) { - struct dentry *dir = file->dir; - struct dentry *child; - - if (dir) { - spin_lock(&dir->d_lock); /* probably unneeded */ - list_for_each_entry(child, &dir->d_subdirs, d_child) { - if (d_really_is_positive(child)) /* probably unneeded */ - d_inode(child)->i_private = NULL; - } - spin_unlock(&dir->d_lock); - - tracefs_remove(dir); - } eventfs_remove(file->ef); list_del(&file->list); remove_subsystem(file->system); From 145036f88d693d7ef3aa8537a4b1aa22f8764647 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Wed, 14 Jun 2023 14:40:46 +0530 Subject: [PATCH 061/161] selftests/ftrace: Fix dependencies for some of the synthetic event tests Commit b81a3a100cca1b ("tracing/histogram: Add simple tests for stacktrace usage of synthetic events") changed the output text in tracefs README, but missed updating some of the dependencies specified in selftests. This causes some of the tests to exit as unsupported. Fix this by changing the grep pattern. Since we want these tests to work on older kernels, match only against the common last part of the pattern. Link: https://lore.kernel.org/linux-trace-kernel/20230614091046.2178539-1-naveen@kernel.org Cc: Cc: Masami Hiramatsu Cc: Shuah Khan Fixes: b81a3a100cca ("tracing/histogram: Add simple tests for stacktrace usage of synthetic events") Signed-off-by: Naveen N Rao Signed-off-by: Steven Rostedt (Google) --- .../trigger/inter-event/trigger-synthetic-event-dynstring.tc | 2 +- .../inter-event/trigger-synthetic_event_syntax_errors.tc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-dynstring.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-dynstring.tc index 213d890ed1886..174376ddbc6c7 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-dynstring.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-dynstring.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: event trigger - test inter-event histogram trigger trace action with dynamic string param -# requires: set_event synthetic_events events/sched/sched_process_exec/hist "char name[]' >> synthetic_events":README ping:program +# requires: set_event synthetic_events events/sched/sched_process_exec/hist "' >> synthetic_events":README ping:program fail() { #msg echo $1 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc index 955e3ceea44b5..b927ee54c02da 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: event trigger - test synthetic_events syntax parser errors -# requires: synthetic_events error_log "char name[]' >> synthetic_events":README +# requires: synthetic_events error_log "' >> synthetic_events":README check_error() { # command-with-error-pos-by-^ ftrace_errlog_check 'synthetic_events' "$1" 'synthetic_events' From fdd2630a7398191e84822612e589062063bd4f3d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 9 Sep 2023 07:12:30 -0400 Subject: [PATCH 062/161] nfsd: fix change_info in NFSv4 RENAME replies nfsd sends the transposed directory change info in the RENAME reply. The source directory is in save_fh and the target is in current_fh. Reported-by: Zhi Li Reported-by: Benjamin Coddington Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2218844 Signed-off-by: Jeff Layton Cc: Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5ca748309c262..4199ede0583c7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1058,8 +1058,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, rename->rn_tname, rename->rn_tnamelen); if (status) return status; - set_change_info(&rename->rn_sinfo, &cstate->current_fh); - set_change_info(&rename->rn_tinfo, &cstate->save_fh); + set_change_info(&rename->rn_sinfo, &cstate->save_fh); + set_change_info(&rename->rn_tinfo, &cstate->current_fh); return nfs_ok; } From ced33ca07d8d99435ca3320c740ea947843005ca Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Thu, 7 Sep 2023 00:26:03 +0800 Subject: [PATCH 063/161] selftests/net: Improve bind_bhash.sh to accommodate predictable network interface names Starting with v197, systemd uses predictable interface network names, the traditional interface naming scheme (eth0) is deprecated, therefore it cannot be assumed that the eth0 interface exists on the host. This modification makes the bind_bhash test program run in a separate network namespace and no longer needs to consider the name of the network interface on the host. Signed-off-by: Juntong Deng Signed-off-by: David S. Miller --- tools/testing/selftests/net/bind_bhash.sh | 26 ++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/net/bind_bhash.sh b/tools/testing/selftests/net/bind_bhash.sh index ca0292d4b4419..a28563bdaae06 100755 --- a/tools/testing/selftests/net/bind_bhash.sh +++ b/tools/testing/selftests/net/bind_bhash.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 NR_FILES=32768 -SAVED_NR_FILES=$(ulimit -n) +readonly NETNS="ns-$(mktemp -u XXXXXX)" # default values port=443 @@ -36,21 +36,21 @@ while getopts "ha:p:64" opt; do done setup() { + ip netns add "${NETNS}" + ip -netns "${NETNS}" link add veth0 type veth peer name veth1 + ip -netns "${NETNS}" link set lo up + ip -netns "${NETNS}" link set veth0 up + ip -netns "${NETNS}" link set veth1 up + if [[ "$use_v6" == true ]]; then - ip addr add $addr_v6 nodad dev eth0 + ip -netns "${NETNS}" addr add $addr_v6 nodad dev veth0 else - ip addr add $addr_v4 dev lo + ip -netns "${NETNS}" addr add $addr_v4 dev lo fi - ulimit -n $NR_FILES } cleanup() { - if [[ "$use_v6" == true ]]; then - ip addr del $addr_v6 dev eth0 - else - ip addr del $addr_v4/32 dev lo - fi - ulimit -n $SAVED_NR_FILES + ip netns del "${NETNS}" } if [[ "$addr" != "" ]]; then @@ -59,8 +59,10 @@ if [[ "$addr" != "" ]]; then fi setup if [[ "$use_v6" == true ]] ; then - ./bind_bhash $port "ipv6" $addr_v6 + ip netns exec "${NETNS}" sh -c \ + "ulimit -n ${NR_FILES};./bind_bhash ${port} ipv6 ${addr_v6}" else - ./bind_bhash $port "ipv4" $addr_v4 + ip netns exec "${NETNS}" sh -c \ + "ulimit -n ${NR_FILES};./bind_bhash ${port} ipv4 ${addr_v4}" fi cleanup From e73d1ab6cd7e7190bd891e521d270cd26ad8e40d Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 7 Sep 2023 11:55:12 +0200 Subject: [PATCH 064/161] net: bcmasp: add missing of_node_put for_each_available_child_of_node performs an of_node_get on each iteration, so a break out of the loop requires an of_node_put. This was done using the Coccinelle semantic patch iterators/for_each_child.cocci Signed-off-by: Julia Lawall Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/asp2/bcmasp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.c b/drivers/net/ethernet/broadcom/asp2/bcmasp.c index d63d321f3e7b0..f048e3d451194 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.c @@ -1300,6 +1300,7 @@ static int bcmasp_probe(struct platform_device *pdev) if (!intf) { dev_err(dev, "Cannot create eth interface %d\n", i); bcmasp_remove_intfs(priv); + of_node_put(intf_node); goto of_put_exit; } list_add_tail(&intf->list, &priv->intfs); From 281f65d29d6da1a9b6907fb0b145aaf34f4e4822 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Thu, 7 Sep 2023 22:03:58 +0800 Subject: [PATCH 065/161] net: microchip: vcap api: Fix possible memory leak for vcap_dup_rule() Inject fault When select CONFIG_VCAP_KUNIT_TEST, the below memory leak occurs. If kzalloc() for duprule succeeds, but the following kmemdup() fails, the duprule, ckf and caf memory will be leaked. So kfree them in the error path. unreferenced object 0xffff122744c50600 (size 192): comm "kunit_try_catch", pid 346, jiffies 4294896122 (age 911.812s) hex dump (first 32 bytes): 10 27 00 00 04 00 00 00 1e 00 00 00 2c 01 00 00 .'..........,... 00 00 00 00 00 00 00 00 18 06 c5 44 27 12 ff ff ...........D'... backtrace: [<00000000394b0db8>] __kmem_cache_alloc_node+0x274/0x2f8 [<0000000001bedc67>] kmalloc_trace+0x38/0x88 [<00000000b0612f98>] vcap_dup_rule+0x50/0x460 [<000000005d2d3aca>] vcap_add_rule+0x8cc/0x1038 [<00000000eef9d0f8>] test_vcap_xn_rule_creator.constprop.0.isra.0+0x238/0x494 [<00000000cbda607b>] vcap_api_rule_remove_in_front_test+0x1ac/0x698 [<00000000c8766299>] kunit_try_run_case+0xe0/0x20c [<00000000c4fe9186>] kunit_generic_run_threadfn_adapter+0x50/0x94 [<00000000f6864acf>] kthread+0x2e8/0x374 [<0000000022e639b3>] ret_from_fork+0x10/0x20 Fixes: 814e7693207f ("net: microchip: vcap api: Add a storage state to a VCAP rule") Signed-off-by: Jinjie Ruan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/vcap/vcap_api.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microchip/vcap/vcap_api.c b/drivers/net/ethernet/microchip/vcap/vcap_api.c index 300fe1a93dce8..ef980e4e5bc2f 100644 --- a/drivers/net/ethernet/microchip/vcap/vcap_api.c +++ b/drivers/net/ethernet/microchip/vcap/vcap_api.c @@ -1021,18 +1021,32 @@ static struct vcap_rule_internal *vcap_dup_rule(struct vcap_rule_internal *ri, list_for_each_entry(ckf, &ri->data.keyfields, ctrl.list) { newckf = kmemdup(ckf, sizeof(*newckf), GFP_KERNEL); if (!newckf) - return ERR_PTR(-ENOMEM); + goto err; list_add_tail(&newckf->ctrl.list, &duprule->data.keyfields); } list_for_each_entry(caf, &ri->data.actionfields, ctrl.list) { newcaf = kmemdup(caf, sizeof(*newcaf), GFP_KERNEL); if (!newcaf) - return ERR_PTR(-ENOMEM); + goto err; list_add_tail(&newcaf->ctrl.list, &duprule->data.actionfields); } return duprule; + +err: + list_for_each_entry_safe(ckf, newckf, &duprule->data.keyfields, ctrl.list) { + list_del(&ckf->ctrl.list); + kfree(ckf); + } + + list_for_each_entry_safe(caf, newcaf, &duprule->data.actionfields, ctrl.list) { + list_del(&caf->ctrl.list); + kfree(caf); + } + + kfree(duprule); + return ERR_PTR(-ENOMEM); } static void vcap_apply_width(u8 *dst, int width, int bytes) From 88e69af061f2e061a68751ef9cad47a674527a1b Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Fri, 8 Sep 2023 08:23:09 +0530 Subject: [PATCH 066/161] octeontx2-pf: Fix page pool cache index corruption. The access to page pool `cache' array and the `count' variable is not locked. Page pool cache access is fine as long as there is only one consumer per pool. octeontx2 driver fills in rx buffers from page pool in NAPI context. If system is stressed and could not allocate buffers, refiiling work will be delegated to a delayed workqueue. This means that there are two cosumers to the page pool cache. Either workqueue or IRQ/NAPI can be run on other CPU. This will lead to lock less access, hence corruption of cache pool indexes. To fix this issue, NAPI is rescheduled from workqueue context to refill rx buffers. Fixes: b2e3406a38f0 ("octeontx2-pf: Add support for page pool") Signed-off-by: Ratheesh Kannoth Reported-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- .../ethernet/marvell/octeontx2/nic/cn10k.c | 6 ++- .../ethernet/marvell/octeontx2/nic/cn10k.h | 2 +- .../marvell/octeontx2/nic/otx2_common.c | 43 +++---------------- .../marvell/octeontx2/nic/otx2_common.h | 3 +- .../ethernet/marvell/octeontx2/nic/otx2_pf.c | 7 +-- .../marvell/octeontx2/nic/otx2_txrx.c | 30 ++++++++++--- .../marvell/octeontx2/nic/otx2_txrx.h | 4 +- 7 files changed, 44 insertions(+), 51 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c index 826f691de2595..a4a258da8dd59 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c @@ -107,12 +107,13 @@ int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura) } #define NPA_MAX_BURST 16 -void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) +int cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) { struct otx2_nic *pfvf = dev; + int cnt = cq->pool_ptrs; u64 ptrs[NPA_MAX_BURST]; - int num_ptrs = 1; dma_addr_t bufptr; + int num_ptrs = 1; /* Refill pool with new buffers */ while (cq->pool_ptrs) { @@ -131,6 +132,7 @@ void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) num_ptrs = 1; } } + return cnt - cq->pool_ptrs; } void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h index 8ae96815865e6..c1861f7de2545 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h @@ -24,7 +24,7 @@ static inline int mtu_to_dwrr_weight(struct otx2_nic *pfvf, int mtu) return weight; } -void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); +int cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx); int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura); int cn10k_lmtst_init(struct otx2_nic *pfvf); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 8511906cb4e2d..997fedac3a98f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -574,20 +574,8 @@ int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, int otx2_alloc_buffer(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, dma_addr_t *dma) { - if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, dma))) { - struct refill_work *work; - struct delayed_work *dwork; - - work = &pfvf->refill_wrk[cq->cq_idx]; - dwork = &work->pool_refill_work; - /* Schedule a task if no other task is running */ - if (!cq->refill_task_sched) { - cq->refill_task_sched = true; - schedule_delayed_work(dwork, - msecs_to_jiffies(100)); - } + if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, dma))) return -ENOMEM; - } return 0; } @@ -1082,39 +1070,20 @@ static int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx) static void otx2_pool_refill_task(struct work_struct *work) { struct otx2_cq_queue *cq; - struct otx2_pool *rbpool; struct refill_work *wrk; - int qidx, free_ptrs = 0; struct otx2_nic *pfvf; - dma_addr_t bufptr; + int qidx; wrk = container_of(work, struct refill_work, pool_refill_work.work); pfvf = wrk->pf; qidx = wrk - pfvf->refill_wrk; cq = &pfvf->qset.cq[qidx]; - rbpool = cq->rbpool; - free_ptrs = cq->pool_ptrs; - while (cq->pool_ptrs) { - if (otx2_alloc_rbuf(pfvf, rbpool, &bufptr)) { - /* Schedule a WQ if we fails to free atleast half of the - * pointers else enable napi for this RQ. - */ - if (!((free_ptrs - cq->pool_ptrs) > free_ptrs / 2)) { - struct delayed_work *dwork; - - dwork = &wrk->pool_refill_work; - schedule_delayed_work(dwork, - msecs_to_jiffies(100)); - } else { - cq->refill_task_sched = false; - } - return; - } - pfvf->hw_ops->aura_freeptr(pfvf, qidx, bufptr + OTX2_HEAD_ROOM); - cq->pool_ptrs--; - } cq->refill_task_sched = false; + + local_bh_disable(); + napi_schedule(wrk->napi); + local_bh_enable(); } int otx2_config_nix_queues(struct otx2_nic *pfvf) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 4c6032ee7800d..c04a8ee53a82f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -302,6 +302,7 @@ struct flr_work { struct refill_work { struct delayed_work pool_refill_work; struct otx2_nic *pf; + struct napi_struct *napi; }; /* PTPv2 originTimestamp structure */ @@ -370,7 +371,7 @@ struct dev_hw_ops { int (*sq_aq_init)(void *dev, u16 qidx, u16 sqb_aura); void (*sqe_flush)(void *dev, struct otx2_snd_queue *sq, int size, int qidx); - void (*refill_pool_ptrs)(void *dev, struct otx2_cq_queue *cq); + int (*refill_pool_ptrs)(void *dev, struct otx2_cq_queue *cq); void (*aura_freeptr)(void *dev, int aura, u64 buf); }; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 70b9065f7d101..6daf4d58c25d6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1943,6 +1943,10 @@ int otx2_stop(struct net_device *netdev) netif_tx_disable(netdev); + for (wrk = 0; wrk < pf->qset.cq_cnt; wrk++) + cancel_delayed_work_sync(&pf->refill_wrk[wrk].pool_refill_work); + devm_kfree(pf->dev, pf->refill_wrk); + otx2_free_hw_resources(pf); otx2_free_cints(pf, pf->hw.cint_cnt); otx2_disable_napi(pf); @@ -1950,9 +1954,6 @@ int otx2_stop(struct net_device *netdev) for (qidx = 0; qidx < netdev->num_tx_queues; qidx++) netdev_tx_reset_queue(netdev_get_tx_queue(netdev, qidx)); - for (wrk = 0; wrk < pf->qset.cq_cnt; wrk++) - cancel_delayed_work_sync(&pf->refill_wrk[wrk].pool_refill_work); - devm_kfree(pf->dev, pf->refill_wrk); kfree(qset->sq); kfree(qset->cq); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index e369baf115301..e77d438489557 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -424,9 +424,10 @@ static int otx2_rx_napi_handler(struct otx2_nic *pfvf, return processed_cqe; } -void otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) +int otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) { struct otx2_nic *pfvf = dev; + int cnt = cq->pool_ptrs; dma_addr_t bufptr; while (cq->pool_ptrs) { @@ -435,6 +436,8 @@ void otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) otx2_aura_freeptr(pfvf, cq->cq_idx, bufptr + OTX2_HEAD_ROOM); cq->pool_ptrs--; } + + return cnt - cq->pool_ptrs; } static int otx2_tx_napi_handler(struct otx2_nic *pfvf, @@ -521,6 +524,7 @@ int otx2_napi_handler(struct napi_struct *napi, int budget) struct otx2_cq_queue *cq; struct otx2_qset *qset; struct otx2_nic *pfvf; + int filled_cnt = -1; cq_poll = container_of(napi, struct otx2_cq_poll, napi); pfvf = (struct otx2_nic *)cq_poll->dev; @@ -541,7 +545,7 @@ int otx2_napi_handler(struct napi_struct *napi, int budget) } if (rx_cq && rx_cq->pool_ptrs) - pfvf->hw_ops->refill_pool_ptrs(pfvf, rx_cq); + filled_cnt = pfvf->hw_ops->refill_pool_ptrs(pfvf, rx_cq); /* Clear the IRQ */ otx2_write64(pfvf, NIX_LF_CINTX_INT(cq_poll->cint_idx), BIT_ULL(0)); @@ -561,9 +565,25 @@ int otx2_napi_handler(struct napi_struct *napi, int budget) otx2_config_irq_coalescing(pfvf, i); } - /* Re-enable interrupts */ - otx2_write64(pfvf, NIX_LF_CINTX_ENA_W1S(cq_poll->cint_idx), - BIT_ULL(0)); + if (unlikely(!filled_cnt)) { + struct refill_work *work; + struct delayed_work *dwork; + + work = &pfvf->refill_wrk[cq->cq_idx]; + dwork = &work->pool_refill_work; + /* Schedule a task if no other task is running */ + if (!cq->refill_task_sched) { + work->napi = napi; + cq->refill_task_sched = true; + schedule_delayed_work(dwork, + msecs_to_jiffies(100)); + } + } else { + /* Re-enable interrupts */ + otx2_write64(pfvf, + NIX_LF_CINTX_ENA_W1S(cq_poll->cint_idx), + BIT_ULL(0)); + } } return workdone; } diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h index 9e3bfbe5c4809..a82ffca8ce1b1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h @@ -170,6 +170,6 @@ void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx); void otx2_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx); -void otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); -void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); +int otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); +int cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); #endif /* OTX2_TXRX_H */ From 6912e724832c47bb381eb1bd1e483ec8df0d0f0f Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Fri, 8 Sep 2023 11:31:42 +0800 Subject: [PATCH 067/161] net/smc: bugfix for smcr v2 server connect success statistic In the macro SMC_STAT_SERV_SUCC_INC, the smcd_version is used to determin whether to increase the v1 statistic or the v2 statistic. It is correct for SMCD. But for SMCR, smcr_version should be used. Signed-off-by: Guangguan Wang Signed-off-by: David S. Miller --- net/smc/smc_stats.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h index b60fe1eb37ab6..aa8928975cc63 100644 --- a/net/smc/smc_stats.h +++ b/net/smc/smc_stats.h @@ -243,8 +243,9 @@ while (0) #define SMC_STAT_SERV_SUCC_INC(net, _ini) \ do { \ typeof(_ini) i = (_ini); \ - bool is_v2 = (i->smcd_version & SMC_V2); \ bool is_smcd = (i->is_smcd); \ + u8 version = is_smcd ? i->smcd_version : i->smcr_version; \ + bool is_v2 = (version & SMC_V2); \ typeof(net->smc.smc_stats) smc_stats = (net)->smc.smc_stats; \ if (is_v2 && is_smcd) \ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v2_succ_cnt); \ From f5146e3ef0a9eea405874b36178c19a4863b8989 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Fri, 8 Sep 2023 11:31:43 +0800 Subject: [PATCH 068/161] net/smc: use smc_lgr_list.lock to protect smc_lgr_list.list iterate in smcr_port_add While doing smcr_port_add, there maybe linkgroup add into or delete from smc_lgr_list.list at the same time, which may result kernel crash. So, use smc_lgr_list.lock to protect smc_lgr_list.list iterate in smcr_port_add. The crash calltrace show below: BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 0 PID: 559726 Comm: kworker/0:92 Kdump: loaded Tainted: G Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 449e491 04/01/2014 Workqueue: events smc_ib_port_event_work [smc] RIP: 0010:smcr_port_add+0xa6/0xf0 [smc] RSP: 0000:ffffa5a2c8f67de0 EFLAGS: 00010297 RAX: 0000000000000001 RBX: ffff9935e0650000 RCX: 0000000000000000 RDX: 0000000000000010 RSI: ffff9935e0654290 RDI: ffff9935c8560000 RBP: 0000000000000000 R08: 0000000000000000 R09: ffff9934c0401918 R10: 0000000000000000 R11: ffffffffb4a5c278 R12: ffff99364029aae4 R13: ffff99364029aa00 R14: 00000000ffffffed R15: ffff99364029ab08 FS: 0000000000000000(0000) GS:ffff994380600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000f06a10003 CR4: 0000000002770ef0 PKRU: 55555554 Call Trace: smc_ib_port_event_work+0x18f/0x380 [smc] process_one_work+0x19b/0x340 worker_thread+0x30/0x370 ? process_one_work+0x340/0x340 kthread+0x114/0x130 ? __kthread_cancel_work+0x50/0x50 ret_from_fork+0x1f/0x30 Fixes: 1f90a05d9ff9 ("net/smc: add smcr_port_add() and smcr_link_up() processing") Signed-off-by: Guangguan Wang Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index bd01dd31e4bd7..d520ee62c8ecd 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1662,6 +1662,7 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *n; + spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { struct smc_link *link; @@ -1680,6 +1681,7 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) if (link) smc_llc_add_link_local(link); } + spin_unlock_bh(&smc_lgr_list.lock); } /* link is down - switch connections to alternate link, From 762f169f5d9b92f057ad2c27ec4e3849b743239a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Aug 2023 18:21:20 +0200 Subject: [PATCH 069/161] efi/x86: Move EFI runtime call setup/teardown helpers out of line Only the arch_efi_call_virt() macro that some architectures override needs to be a macro, given that it is variadic and encapsulates calls via function pointers that have different prototypes. The associated setup and teardown code are not special in this regard, and don't need to be instantiated at each call site. So turn them into ordinary C functions and move them out of line. Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 32 ++------------------------------ arch/x86/platform/efi/efi_32.c | 12 ++++++++++++ arch/x86/platform/efi/efi_64.c | 19 +++++++++++++++++-- 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index b0994ae3bc23f..c4555b269a1b2 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -91,19 +91,6 @@ static inline void efi_fpu_end(void) #ifdef CONFIG_X86_32 #define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1) - -#define arch_efi_call_virt_setup() \ -({ \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ -}) - -#define arch_efi_call_virt_teardown() \ -({ \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - #else /* !CONFIG_X86_32 */ #define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT @@ -116,14 +103,6 @@ extern bool efi_disable_ibt_for_runtime; __efi_call(__VA_ARGS__); \ }) -#define arch_efi_call_virt_setup() \ -({ \ - efi_sync_low_kernel_mappings(); \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ - efi_enter_mm(); \ -}) - #undef arch_efi_call_virt #define arch_efi_call_virt(p, f, args...) ({ \ u64 ret, ibt = ibt_save(efi_disable_ibt_for_runtime); \ @@ -132,13 +111,6 @@ extern bool efi_disable_ibt_for_runtime; ret; \ }) -#define arch_efi_call_virt_teardown() \ -({ \ - efi_leave_mm(); \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - #ifdef CONFIG_KASAN /* * CONFIG_KASAN may redefine memset to __memset. __memset function is present @@ -168,8 +140,8 @@ extern void efi_delete_dummy_variable(void); extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); -void efi_enter_mm(void); -void efi_leave_mm(void); +void arch_efi_call_virt_setup(void); +void arch_efi_call_virt_teardown(void); /* kexec external ABI */ struct efi_setup_data { diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index e06a199423c0f..b2cc7b4552a16 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -140,3 +140,15 @@ void __init efi_runtime_update_mappings(void) } } } + +void arch_efi_call_virt_setup(void) +{ + efi_fpu_begin(); + firmware_restrict_branch_speculation_start(); +} + +void arch_efi_call_virt_teardown(void) +{ + firmware_restrict_branch_speculation_end(); + efi_fpu_end(); +} diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 77f7ac3668cb4..91d31ac422d6c 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -474,19 +474,34 @@ void __init efi_dump_pagetable(void) * can not change under us. * It should be ensured that there are no concurrent calls to this function. */ -void efi_enter_mm(void) +static void efi_enter_mm(void) { efi_prev_mm = current->active_mm; current->active_mm = &efi_mm; switch_mm(efi_prev_mm, &efi_mm, NULL); } -void efi_leave_mm(void) +static void efi_leave_mm(void) { current->active_mm = efi_prev_mm; switch_mm(&efi_mm, efi_prev_mm, NULL); } +void arch_efi_call_virt_setup(void) +{ + efi_sync_low_kernel_mappings(); + efi_fpu_begin(); + firmware_restrict_branch_speculation_start(); + efi_enter_mm(); +} + +void arch_efi_call_virt_teardown(void) +{ + efi_leave_mm(); + firmware_restrict_branch_speculation_end(); + efi_fpu_end(); +} + static DEFINE_SPINLOCK(efi_runtime_lock); /* From aba7e066c738d4b349413a271b2a236aa55bacbc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 2 Aug 2023 17:17:04 +0200 Subject: [PATCH 070/161] efi/x86: Ensure that EFI_RUNTIME_MAP is enabled for kexec CONFIG_EFI_RUNTIME_MAP needs to be enabled in order for kexec to be able to provide the required information about the EFI runtime mappings to the incoming kernel, regardless of whether kexec_load() or kexec_file_load() is being used. Without this information, kexec boot in EFI mode is not possible. The CONFIG_EFI_RUNTIME_MAP option is currently directly configurable if CONFIG_EXPERT is enabled, so that it can be turned on for debugging purposes even if KEXEC is not enabled. However, the upshot of this is that it can also be disabled even when it shouldn't. So tweak the Kconfig declarations to avoid this situation. Reported-by: Kirill A. Shutemov Signed-off-by: Ard Biesheuvel --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 982b777eadc7a..66bfabae88149 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1945,6 +1945,7 @@ config EFI select UCS2_STRING select EFI_RUNTIME_WRAPPERS select ARCH_USE_MEMREMAP_PROT + select EFI_RUNTIME_MAP if KEXEC_CORE help This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). @@ -2020,7 +2021,6 @@ config EFI_MAX_FAKE_MEM config EFI_RUNTIME_MAP bool "Export EFI runtime maps to sysfs" if EXPERT depends on EFI - default KEXEC_CORE help Export EFI runtime memory regions to /sys/firmware/efi/runtime-map. That memory map is required by the 2nd kernel to set up EFI virtual From e7761d827e99919c32400056a884e481ef008ec4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 16 Aug 2023 21:05:57 +0200 Subject: [PATCH 071/161] efi/unaccepted: Use ACPI reclaim memory for unaccepted memory table Kyril reports that crashkernels fail to work on confidential VMs that rely on the unaccepted memory table, and this appears to be caused by the fact that it is not considered part of the set of firmware tables that the crashkernel needs to map. This is an oversight, and a result of the use of the EFI_LOADER_DATA memory type for this table. The correct memory type to use for any firmware table is EFI_ACPI_RECLAIM_MEMORY (including ones created by the EFI stub), even though the name suggests that is it specific to ACPI. ACPI reclaim means that the memory is used by the firmware to expose information to the operating system, but that the memory region has no special significance to the firmware itself, and the OS is free to reclaim the memory and use it as ordinary memory if it is not interested in the contents, or if it has already consumed them. In Linux, this memory is never reclaimed, but it is always covered by the kernel direct map and generally made accessible as ordinary memory. On x86, ACPI reclaim memory is translated into E820_ACPI, which the kexec logic already recognizes as memory that the crashkernel may need to to access, and so it will be mapped and accessible to the booting crash kernel. Fixes: 745e3ed85f71 ("efi/libstub: Implement support for unaccepted memory") Reported-by: Kirill A. Shutemov Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/unaccepted_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/unaccepted_memory.c b/drivers/firmware/efi/libstub/unaccepted_memory.c index ca61f4733ea58..9a655f30ba47d 100644 --- a/drivers/firmware/efi/libstub/unaccepted_memory.c +++ b/drivers/firmware/efi/libstub/unaccepted_memory.c @@ -62,7 +62,7 @@ efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc, bitmap_size = DIV_ROUND_UP(unaccepted_end - unaccepted_start, EFI_UNACCEPTED_UNIT_SIZE * BITS_PER_BYTE); - status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, + status = efi_bs_call(allocate_pool, EFI_ACPI_RECLAIM_MEMORY, sizeof(*unaccepted_table) + bitmap_size, (void **)&unaccepted_table); if (status != EFI_SUCCESS) { From fa60b8163816f194786f3ee334c9a458da7699c6 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 7 Sep 2023 12:46:31 +0200 Subject: [PATCH 072/161] net: stmmac: fix handling of zero coalescing tx-usecs Setting ethtool -C eth0 tx-usecs 0 is supposed to disable the use of the coalescing timer but currently it gets programmed with zero delay instead. Disable the use of the coalescing timer if tx-usecs is zero by preventing it from being restarted. Note that to keep things simple we don't start/stop the timer when the coalescing settings are changed, but just let that happen on the next transmit or timer expiry. Fixes: 8fce33317023 ("net: stmmac: Rework coalesce timer and fix multi-queue races") Signed-off-by: Vincent Whitchurch Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 9a3182b9e767c..2206789802bf7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2704,9 +2704,7 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue) /* We still have pending packets, let's call for a new scheduling */ if (tx_q->dirty_tx != tx_q->cur_tx) - hrtimer_start(&tx_q->txtimer, - STMMAC_COAL_TIMER(priv->tx_coal_timer[queue]), - HRTIMER_MODE_REL); + stmmac_tx_timer_arm(priv, queue); flags = u64_stats_update_begin_irqsave(&tx_q->txq_stats.syncp); tx_q->txq_stats.tx_packets += tx_packets; @@ -2995,9 +2993,13 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv) static void stmmac_tx_timer_arm(struct stmmac_priv *priv, u32 queue) { struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue]; + u32 tx_coal_timer = priv->tx_coal_timer[queue]; + + if (!tx_coal_timer) + return; hrtimer_start(&tx_q->txtimer, - STMMAC_COAL_TIMER(priv->tx_coal_timer[queue]), + STMMAC_COAL_TIMER(tx_coal_timer), HRTIMER_MODE_REL); } From 9b90aca97f6d5255ca41e716720d138b878cd034 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Fri, 8 Sep 2023 14:19:48 +0800 Subject: [PATCH 073/161] net: ethernet: bcmasp: fix possible OOB write in bcmasp_netfilt_get_all_active() rule_locs is allocated in ethtool_get_rxnfc and the size is determined by rule_cnt from user space. So rule_cnt needs to be check before using rule_locs to avoid OOB writing or NULL pointer dereference. Fixes: c5d511c49587 ("net: bcmasp: Add support for wake on net filters") Signed-off-by: Hangyu Hua Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/asp2/bcmasp.c | 9 +++++++-- drivers/net/ethernet/broadcom/asp2/bcmasp.h | 4 ++-- drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.c b/drivers/net/ethernet/broadcom/asp2/bcmasp.c index f048e3d451194..41a6098eb0c2f 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.c @@ -528,13 +528,16 @@ void bcmasp_netfilt_suspend(struct bcmasp_intf *intf) ASP_RX_FILTER_BLK_CTRL); } -void bcmasp_netfilt_get_all_active(struct bcmasp_intf *intf, u32 *rule_locs, - u32 *rule_cnt) +int bcmasp_netfilt_get_all_active(struct bcmasp_intf *intf, u32 *rule_locs, + u32 *rule_cnt) { struct bcmasp_priv *priv = intf->parent; int j = 0, i; for (i = 0; i < NUM_NET_FILTERS; i++) { + if (j == *rule_cnt) + return -EMSGSIZE; + if (!priv->net_filters[i].claimed || priv->net_filters[i].port != intf->port) continue; @@ -548,6 +551,8 @@ void bcmasp_netfilt_get_all_active(struct bcmasp_intf *intf, u32 *rule_locs, } *rule_cnt = j; + + return 0; } int bcmasp_netfilt_get_active(struct bcmasp_intf *intf) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.h b/drivers/net/ethernet/broadcom/asp2/bcmasp.h index 5b512f7f5e949..ec90add6b03e2 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp.h +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.h @@ -577,8 +577,8 @@ void bcmasp_netfilt_release(struct bcmasp_intf *intf, int bcmasp_netfilt_get_active(struct bcmasp_intf *intf); -void bcmasp_netfilt_get_all_active(struct bcmasp_intf *intf, u32 *rule_locs, - u32 *rule_cnt); +int bcmasp_netfilt_get_all_active(struct bcmasp_intf *intf, u32 *rule_locs, + u32 *rule_cnt); void bcmasp_netfilt_suspend(struct bcmasp_intf *intf); diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c index c4f1604d5ab39..ce6a3d56fb239 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c @@ -335,7 +335,7 @@ static int bcmasp_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, err = bcmasp_flow_get(intf, cmd); break; case ETHTOOL_GRXCLSRLALL: - bcmasp_netfilt_get_all_active(intf, rule_locs, &cmd->rule_cnt); + err = bcmasp_netfilt_get_all_active(intf, rule_locs, &cmd->rule_cnt); cmd->data = NUM_NET_FILTERS; break; default: From 51fe0a470543f345e3c62b6798929de3ddcedc1d Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Fri, 8 Sep 2023 14:19:49 +0800 Subject: [PATCH 074/161] net: ethernet: mvpp2_main: fix possible OOB write in mvpp2_ethtool_get_rxnfc() rules is allocated in ethtool_get_rxnfc and the size is determined by rule_cnt from user space. So rule_cnt needs to be check before using rules to avoid OOB writing or NULL pointer dereference. Fixes: 90b509b39ac9 ("net: mvpp2: cls: Add Classification offload support") Signed-off-by: Hangyu Hua Reviewed-by: Marcin Wojtas Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index eb74ccddb4409..21c3f9b015c85 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -5586,6 +5586,11 @@ static int mvpp2_ethtool_get_rxnfc(struct net_device *dev, break; case ETHTOOL_GRXCLSRLALL: for (i = 0; i < MVPP2_N_RFS_ENTRIES_PER_FLOW; i++) { + if (loc == info->rule_cnt) { + ret = -EMSGSIZE; + break; + } + if (port->rfs_rules[i]) rules[loc++] = i; } From e4c79810755f66c9a933ca810da2724133b1165a Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Fri, 8 Sep 2023 14:19:50 +0800 Subject: [PATCH 075/161] net: ethernet: mtk_eth_soc: fix possible NULL pointer dereference in mtk_hwlro_get_fdir_all() rule_locs is allocated in ethtool_get_rxnfc and the size is determined by rule_cnt from user space. So rule_cnt needs to be check before using rule_locs to avoid NULL pointer dereference. Fixes: 7aab747e5563 ("net: ethernet: mediatek: add ethtool functions to configure RX flows of HW LRO") Signed-off-by: Hangyu Hua Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 6ad42e3b488f7..2372ce8c25803 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -2994,6 +2994,9 @@ static int mtk_hwlro_get_fdir_all(struct net_device *dev, int i; for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) { + if (cnt == cmd->rule_cnt) + return -EMSGSIZE; + if (mac->hwlro_ip[i]) { rule_locs[cnt] = i; cnt++; From 484b4833c604c0adcf19eac1ca14b60b757355b5 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Fri, 8 Sep 2023 18:17:52 +0800 Subject: [PATCH 076/161] hsr: Fix uninit-value access in fill_frame_info() Syzbot reports the following uninit-value access problem. ===================================================== BUG: KMSAN: uninit-value in fill_frame_info net/hsr/hsr_forward.c:601 [inline] BUG: KMSAN: uninit-value in hsr_forward_skb+0x9bd/0x30f0 net/hsr/hsr_forward.c:616 fill_frame_info net/hsr/hsr_forward.c:601 [inline] hsr_forward_skb+0x9bd/0x30f0 net/hsr/hsr_forward.c:616 hsr_dev_xmit+0x192/0x330 net/hsr/hsr_device.c:223 __netdev_start_xmit include/linux/netdevice.h:4889 [inline] netdev_start_xmit include/linux/netdevice.h:4903 [inline] xmit_one net/core/dev.c:3544 [inline] dev_hard_start_xmit+0x247/0xa10 net/core/dev.c:3560 __dev_queue_xmit+0x34d0/0x52a0 net/core/dev.c:4340 dev_queue_xmit include/linux/netdevice.h:3082 [inline] packet_xmit+0x9c/0x6b0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3087 [inline] packet_sendmsg+0x8b1d/0x9f30 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:730 [inline] sock_sendmsg net/socket.c:753 [inline] __sys_sendto+0x781/0xa30 net/socket.c:2176 __do_sys_sendto net/socket.c:2188 [inline] __se_sys_sendto net/socket.c:2184 [inline] __ia32_sys_sendto+0x11f/0x1c0 net/socket.c:2184 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline] __do_fast_syscall_32+0xa2/0x100 arch/x86/entry/common.c:178 do_fast_syscall_32+0x37/0x80 arch/x86/entry/common.c:203 do_SYSENTER_32+0x1f/0x30 arch/x86/entry/common.c:246 entry_SYSENTER_compat_after_hwframe+0x70/0x82 Uninit was created at: slab_post_alloc_hook+0x12f/0xb70 mm/slab.h:767 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x577/0xa80 mm/slub.c:3523 kmalloc_reserve+0x148/0x470 net/core/skbuff.c:559 __alloc_skb+0x318/0x740 net/core/skbuff.c:644 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6299 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2794 packet_alloc_skb net/packet/af_packet.c:2936 [inline] packet_snd net/packet/af_packet.c:3030 [inline] packet_sendmsg+0x70e8/0x9f30 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:730 [inline] sock_sendmsg net/socket.c:753 [inline] __sys_sendto+0x781/0xa30 net/socket.c:2176 __do_sys_sendto net/socket.c:2188 [inline] __se_sys_sendto net/socket.c:2184 [inline] __ia32_sys_sendto+0x11f/0x1c0 net/socket.c:2184 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline] __do_fast_syscall_32+0xa2/0x100 arch/x86/entry/common.c:178 do_fast_syscall_32+0x37/0x80 arch/x86/entry/common.c:203 do_SYSENTER_32+0x1f/0x30 arch/x86/entry/common.c:246 entry_SYSENTER_compat_after_hwframe+0x70/0x82 It is because VLAN not yet supported in hsr driver. Return error when protocol is ETH_P_8021Q in fill_frame_info() now to fix it. Fixes: 451d8123f897 ("net: prp: add packet handling support") Reported-by: syzbot+bf7e6250c7ce248f3ec9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=bf7e6250c7ce248f3ec9 Signed-off-by: Ziyang Xuan Signed-off-by: David S. Miller --- net/hsr/hsr_forward.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 629daacc96071..b71dab630a873 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -594,6 +594,7 @@ static int fill_frame_info(struct hsr_frame_info *frame, proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; /* FIXME: */ netdev_warn_once(skb->dev, "VLAN not yet supported"); + return -EINVAL; } frame->is_from_san = false; From 32530dba1bd48da4437d18d9a8dbc9d2826938a6 Mon Sep 17 00:00:00 2001 From: Ciprian Regus Date: Fri, 8 Sep 2023 15:58:08 +0300 Subject: [PATCH 077/161] net:ethernet:adi:adin1110: Fix forwarding offload Currently, when a new fdb entry is added (with both ports of the ADIN2111 bridged), the driver configures the MAC filters for the wrong port, which results in the forwarding being done by the host, and not actually hardware offloaded. The ADIN2111 offloads the forwarding by setting filters on the destination MAC address of incoming frames. Based on these, they may be routed to the other port. Thus, if a frame has to be forwarded from port 1 to port 2, the required configuration for the ADDR_FILT_UPRn register should set the APPLY2PORT1 bit (instead of APPLY2PORT2, as it's currently the case). Fixes: bc93e19d088b ("net: ethernet: adi: Add ADIN1110 support") Signed-off-by: Ciprian Regus Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/adi/adin1110.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/adi/adin1110.c b/drivers/net/ethernet/adi/adin1110.c index 1c009b485188d..ca66b747b7c5d 100644 --- a/drivers/net/ethernet/adi/adin1110.c +++ b/drivers/net/ethernet/adi/adin1110.c @@ -1385,7 +1385,7 @@ static int adin1110_fdb_add(struct adin1110_port_priv *port_priv, return -ENOMEM; other_port = priv->ports[!port_priv->nr]; - port_rules = adin1110_port_rules(port_priv, false, true); + port_rules = adin1110_port_rules(other_port, false, true); eth_broadcast_addr(mask); return adin1110_write_mac_address(other_port, mac_nr, (u8 *)fdb->addr, From 02c652f5465011126152bbd93b6a582a1d0c32f1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 8 Sep 2023 16:33:48 +0300 Subject: [PATCH 078/161] net: dsa: sja1105: hide all multicast addresses from "bridge fdb show" Commit 4d9423549501 ("net: dsa: sja1105: offload bridge port flags to device") has partially hidden some multicast entries from showing up in the "bridge fdb show" output, but it wasn't enough. Addresses which are added through "bridge mdb add" still show up. Hide them all. Fixes: 291d1e72b756 ("net: dsa: sja1105: Add support for FDB and MDB management") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index a23d980d28f5d..11c917d5ce436 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1868,13 +1868,14 @@ static int sja1105_fdb_dump(struct dsa_switch *ds, int port, if (!(l2_lookup.destports & BIT(port))) continue; - /* We need to hide the FDB entry for unknown multicast */ - if (l2_lookup.macaddr == SJA1105_UNKNOWN_MULTICAST && - l2_lookup.mask_macaddr == SJA1105_UNKNOWN_MULTICAST) - continue; - u64_to_ether_addr(l2_lookup.macaddr, macaddr); + /* Hardware FDB is shared for fdb and mdb, "bridge fdb show" + * only wants to see unicast + */ + if (is_multicast_ether_addr(macaddr)) + continue; + /* We need to hide the dsa_8021q VLANs from the user. */ if (vid_is_dsa_8021q(l2_lookup.vlanid)) l2_lookup.vlanid = 0; From c956798062b5a308db96e75157747291197f0378 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 8 Sep 2023 16:33:49 +0300 Subject: [PATCH 079/161] net: dsa: sja1105: propagate exact error code from sja1105_dynamic_config_poll_valid() Currently, sja1105_dynamic_config_wait_complete() returns either 0 or -ETIMEDOUT, because it just looks at the read_poll_timeout() return code. There will be future changes which move some more checks to sja1105_dynamic_config_poll_valid(). It is important that we propagate their exact return code (-ENOENT, -EINVAL), because callers of sja1105_dynamic_config_read() depend on them. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_dynamic_config.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c index 7729d3f8b7f50..93d47dab8d3e9 100644 --- a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c +++ b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c @@ -1211,13 +1211,14 @@ sja1105_dynamic_config_wait_complete(struct sja1105_private *priv, struct sja1105_dyn_cmd *cmd, const struct sja1105_dynamic_table_ops *ops) { - int rc; - - return read_poll_timeout(sja1105_dynamic_config_poll_valid, - rc, rc != -EAGAIN, - SJA1105_DYNAMIC_CONFIG_SLEEP_US, - SJA1105_DYNAMIC_CONFIG_TIMEOUT_US, - false, priv, cmd, ops); + int err, rc; + + err = read_poll_timeout(sja1105_dynamic_config_poll_valid, + rc, rc != -EAGAIN, + SJA1105_DYNAMIC_CONFIG_SLEEP_US, + SJA1105_DYNAMIC_CONFIG_TIMEOUT_US, + false, priv, cmd, ops); + return err < 0 ? err : rc; } /* Provides read access to the settings through the dynamic interface From 7cef293b9a634a05fcce9e1df4aee3aeed023345 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 8 Sep 2023 16:33:50 +0300 Subject: [PATCH 080/161] net: dsa: sja1105: fix multicast forwarding working only for last added mdb entry The commit cited in Fixes: did 2 things: it refactored the read-back polling from sja1105_dynamic_config_read() into a new function, sja1105_dynamic_config_wait_complete(), and it called that from sja1105_dynamic_config_write() too. What is problematic is the refactoring. The refactored code from sja1105_dynamic_config_poll_valid() works like the previous one, but the problem is that it uses another packed_buf[] SPI buffer, and there was code at the end of sja1105_dynamic_config_read() which was relying on the read-back packed_buf[]: /* Don't dereference possibly NULL pointer - maybe caller * only wanted to see whether the entry existed or not. */ if (entry) ops->entry_packing(packed_buf, entry, UNPACK); After the change, the packed_buf[] that this code sees is no longer the entry read back from hardware, but the original entry that the caller passed to the sja1105_dynamic_config_read(), packed into this buffer. This difference is the most notable with the SJA1105_SEARCH uses from sja1105pqrs_fdb_add() - used for both fdb and mdb. There, we have logic added by commit 728db843df88 ("net: dsa: sja1105: ignore the FDB entry for unknown multicast when adding a new address") to figure out whether the address we're trying to add matches on any existing hardware entry, with the exception of the catch-all multicast address. That logic was broken, because with sja1105_dynamic_config_read() not working properly, it doesn't return us the entry read back from hardware, but the entry that we passed to it. And, since for multicast, a match will always exist, it will tell us that any mdb entry already exists at index=0 L2 Address Lookup table. It is index=0 because the caller doesn't know the index - it wants to find it out, and sja1105_dynamic_config_read() does: if (index < 0) { // SJA1105_SEARCH /* Avoid copying a signed negative number to an u64 */ cmd.index = 0; // <- this cmd.search = true; } else { cmd.index = index; cmd.search = false; } So, to the caller of sja1105_dynamic_config_read(), the returned info looks entirely legit, and it will add all mdb entries to FDB index 0. There, they will always overwrite each other (not to mention, potentially they can also overwrite a pre-existing bridge fdb entry), and the user-visible impact will be that only the last mdb entry will be forwarded as it should. The others won't (will be flooded or dropped, depending on the egress flood settings). Fixing is a bit more complicated, and involves either passing the same packed_buf[] to sja1105_dynamic_config_wait_complete(), or moving all the extra processing on the packed_buf[] to sja1105_dynamic_config_wait_complete(). I've opted for the latter, because it makes sja1105_dynamic_config_wait_complete() a bit more self-contained. Fixes: df405910ab9f ("net: dsa: sja1105: wait for dynamic config command completion on writes too") Reported-by: Yanan Yang Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- .../net/dsa/sja1105/sja1105_dynamic_config.c | 80 +++++++++---------- 1 file changed, 37 insertions(+), 43 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c index 93d47dab8d3e9..984c0e604e8de 100644 --- a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c +++ b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c @@ -1175,18 +1175,15 @@ const struct sja1105_dynamic_table_ops sja1110_dyn_ops[BLK_IDX_MAX_DYN] = { static int sja1105_dynamic_config_poll_valid(struct sja1105_private *priv, - struct sja1105_dyn_cmd *cmd, - const struct sja1105_dynamic_table_ops *ops) + const struct sja1105_dynamic_table_ops *ops, + void *entry, bool check_valident, + bool check_errors) { u8 packed_buf[SJA1105_MAX_DYN_CMD_SIZE] = {}; + struct sja1105_dyn_cmd cmd = {}; int rc; - /* We don't _need_ to read the full entry, just the command area which - * is a fixed SJA1105_SIZE_DYN_CMD. But our cmd_packing() API expects a - * buffer that contains the full entry too. Additionally, our API - * doesn't really know how many bytes into the buffer does the command - * area really begin. So just read back the whole entry. - */ + /* Read back the whole entry + command structure. */ rc = sja1105_xfer_buf(priv, SPI_READ, ops->addr, packed_buf, ops->packed_size); if (rc) @@ -1195,11 +1192,25 @@ sja1105_dynamic_config_poll_valid(struct sja1105_private *priv, /* Unpack the command structure, and return it to the caller in case it * needs to perform further checks on it (VALIDENT). */ - memset(cmd, 0, sizeof(*cmd)); - ops->cmd_packing(packed_buf, cmd, UNPACK); + ops->cmd_packing(packed_buf, &cmd, UNPACK); /* Hardware hasn't cleared VALID => still working on it */ - return cmd->valid ? -EAGAIN : 0; + if (cmd.valid) + return -EAGAIN; + + if (check_valident && !cmd.valident && !(ops->access & OP_VALID_ANYWAY)) + return -ENOENT; + + if (check_errors && cmd.errors) + return -EINVAL; + + /* Don't dereference possibly NULL pointer - maybe caller + * only wanted to see whether the entry existed or not. + */ + if (entry) + ops->entry_packing(packed_buf, entry, UNPACK); + + return 0; } /* Poll the dynamic config entry's control area until the hardware has @@ -1208,8 +1219,9 @@ sja1105_dynamic_config_poll_valid(struct sja1105_private *priv, */ static int sja1105_dynamic_config_wait_complete(struct sja1105_private *priv, - struct sja1105_dyn_cmd *cmd, - const struct sja1105_dynamic_table_ops *ops) + const struct sja1105_dynamic_table_ops *ops, + void *entry, bool check_valident, + bool check_errors) { int err, rc; @@ -1217,7 +1229,8 @@ sja1105_dynamic_config_wait_complete(struct sja1105_private *priv, rc, rc != -EAGAIN, SJA1105_DYNAMIC_CONFIG_SLEEP_US, SJA1105_DYNAMIC_CONFIG_TIMEOUT_US, - false, priv, cmd, ops); + false, priv, ops, entry, check_valident, + check_errors); return err < 0 ? err : rc; } @@ -1287,25 +1300,14 @@ int sja1105_dynamic_config_read(struct sja1105_private *priv, mutex_lock(&priv->dynamic_config_lock); rc = sja1105_xfer_buf(priv, SPI_WRITE, ops->addr, packed_buf, ops->packed_size); - if (rc < 0) { - mutex_unlock(&priv->dynamic_config_lock); - return rc; - } - - rc = sja1105_dynamic_config_wait_complete(priv, &cmd, ops); - mutex_unlock(&priv->dynamic_config_lock); if (rc < 0) - return rc; + goto out; - if (!cmd.valident && !(ops->access & OP_VALID_ANYWAY)) - return -ENOENT; + rc = sja1105_dynamic_config_wait_complete(priv, ops, entry, true, false); +out: + mutex_unlock(&priv->dynamic_config_lock); - /* Don't dereference possibly NULL pointer - maybe caller - * only wanted to see whether the entry existed or not. - */ - if (entry) - ops->entry_packing(packed_buf, entry, UNPACK); - return 0; + return rc; } int sja1105_dynamic_config_write(struct sja1105_private *priv, @@ -1357,22 +1359,14 @@ int sja1105_dynamic_config_write(struct sja1105_private *priv, mutex_lock(&priv->dynamic_config_lock); rc = sja1105_xfer_buf(priv, SPI_WRITE, ops->addr, packed_buf, ops->packed_size); - if (rc < 0) { - mutex_unlock(&priv->dynamic_config_lock); - return rc; - } - - rc = sja1105_dynamic_config_wait_complete(priv, &cmd, ops); - mutex_unlock(&priv->dynamic_config_lock); if (rc < 0) - return rc; + goto out; - cmd = (struct sja1105_dyn_cmd) {0}; - ops->cmd_packing(packed_buf, &cmd, UNPACK); - if (cmd.errors) - return -EINVAL; + rc = sja1105_dynamic_config_wait_complete(priv, ops, NULL, false, true); +out: + mutex_unlock(&priv->dynamic_config_lock); - return 0; + return rc; } static u8 sja1105_crc8_add(u8 crc, u8 byte, u8 poly) From ea32690daf4fa525dc5a4d164bd00ed8c756e1c6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 8 Sep 2023 16:33:51 +0300 Subject: [PATCH 081/161] net: dsa: sja1105: serialize sja1105_port_mcast_flood() with other FDB accesses sja1105_fdb_add() runs from the dsa_owq, and sja1105_port_mcast_flood() runs from switchdev_deferred_process_work(). Prior to the blamed commit, they used to be indirectly serialized through the rtnl_lock(), which no longer holds true because dsa_owq dropped that. So, it is now possible that we traverse the static config BLK_IDX_L2_LOOKUP elements concurrently compared to when we change them, in sja1105_static_fdb_change(). That is not ideal, since it might result in data corruption. Introduce a mutex which serializes accesses to the hardware FDB and to the static config elements for the L2 Address Lookup table. I can't find a good reason to add locking around sja1105_fdb_dump(). I'll add it later if needed. Fixes: 0faf890fc519 ("net: dsa: drop rtnl_lock from dsa_slave_switchdev_event_work") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105.h | 2 + drivers/net/dsa/sja1105/sja1105_main.c | 56 ++++++++++++++++++++------ 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 0617d5ccd3ff1..8c66d3bf61f02 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -266,6 +266,8 @@ struct sja1105_private { * the switch doesn't confuse them with one another. */ struct mutex mgmt_lock; + /* Serializes accesses to the FDB */ + struct mutex fdb_lock; /* PTP two-step TX timestamp ID, and its serialization lock */ spinlock_t ts_id_lock; u8 ts_id; diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 11c917d5ce436..cefd72617af4d 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1798,6 +1798,7 @@ static int sja1105_fdb_add(struct dsa_switch *ds, int port, struct dsa_db db) { struct sja1105_private *priv = ds->priv; + int rc; if (!vid) { switch (db.type) { @@ -1812,12 +1813,16 @@ static int sja1105_fdb_add(struct dsa_switch *ds, int port, } } - return priv->info->fdb_add_cmd(ds, port, addr, vid); + mutex_lock(&priv->fdb_lock); + rc = priv->info->fdb_add_cmd(ds, port, addr, vid); + mutex_unlock(&priv->fdb_lock); + + return rc; } -static int sja1105_fdb_del(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid, - struct dsa_db db) +static int __sja1105_fdb_del(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid, + struct dsa_db db) { struct sja1105_private *priv = ds->priv; @@ -1837,6 +1842,20 @@ static int sja1105_fdb_del(struct dsa_switch *ds, int port, return priv->info->fdb_del_cmd(ds, port, addr, vid); } +static int sja1105_fdb_del(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid, + struct dsa_db db) +{ + struct sja1105_private *priv = ds->priv; + int rc; + + mutex_lock(&priv->fdb_lock); + rc = __sja1105_fdb_del(ds, port, addr, vid, db); + mutex_unlock(&priv->fdb_lock); + + return rc; +} + static int sja1105_fdb_dump(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data) { @@ -1899,6 +1918,8 @@ static void sja1105_fast_age(struct dsa_switch *ds, int port) }; int i; + mutex_lock(&priv->fdb_lock); + for (i = 0; i < SJA1105_MAX_L2_LOOKUP_COUNT; i++) { struct sja1105_l2_lookup_entry l2_lookup = {0}; u8 macaddr[ETH_ALEN]; @@ -1912,7 +1933,7 @@ static void sja1105_fast_age(struct dsa_switch *ds, int port) if (rc) { dev_err(ds->dev, "Failed to read FDB: %pe\n", ERR_PTR(rc)); - return; + break; } if (!(l2_lookup.destports & BIT(port))) @@ -1924,14 +1945,16 @@ static void sja1105_fast_age(struct dsa_switch *ds, int port) u64_to_ether_addr(l2_lookup.macaddr, macaddr); - rc = sja1105_fdb_del(ds, port, macaddr, l2_lookup.vlanid, db); + rc = __sja1105_fdb_del(ds, port, macaddr, l2_lookup.vlanid, db); if (rc) { dev_err(ds->dev, "Failed to delete FDB entry %pM vid %lld: %pe\n", macaddr, l2_lookup.vlanid, ERR_PTR(rc)); - return; + break; } } + + mutex_unlock(&priv->fdb_lock); } static int sja1105_mdb_add(struct dsa_switch *ds, int port, @@ -2955,7 +2978,9 @@ static int sja1105_port_mcast_flood(struct sja1105_private *priv, int to, { struct sja1105_l2_lookup_entry *l2_lookup; struct sja1105_table *table; - int match; + int match, rc; + + mutex_lock(&priv->fdb_lock); table = &priv->static_config.tables[BLK_IDX_L2_LOOKUP]; l2_lookup = table->entries; @@ -2968,7 +2993,8 @@ static int sja1105_port_mcast_flood(struct sja1105_private *priv, int to, if (match == table->entry_count) { NL_SET_ERR_MSG_MOD(extack, "Could not find FDB entry for unknown multicast"); - return -ENOSPC; + rc = -ENOSPC; + goto out; } if (flags.val & BR_MCAST_FLOOD) @@ -2976,10 +3002,13 @@ static int sja1105_port_mcast_flood(struct sja1105_private *priv, int to, else l2_lookup[match].destports &= ~BIT(to); - return sja1105_dynamic_config_write(priv, BLK_IDX_L2_LOOKUP, - l2_lookup[match].index, - &l2_lookup[match], - true); + rc = sja1105_dynamic_config_write(priv, BLK_IDX_L2_LOOKUP, + l2_lookup[match].index, + &l2_lookup[match], true); +out: + mutex_unlock(&priv->fdb_lock); + + return rc; } static int sja1105_port_pre_bridge_flags(struct dsa_switch *ds, int port, @@ -3349,6 +3378,7 @@ static int sja1105_probe(struct spi_device *spi) mutex_init(&priv->ptp_data.lock); mutex_init(&priv->dynamic_config_lock); mutex_init(&priv->mgmt_lock); + mutex_init(&priv->fdb_lock); spin_lock_init(&priv->ts_id_lock); rc = sja1105_parse_dt(priv); From 86899e9e1e29e854b5f6dcc24ba4f75f792c89aa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 8 Sep 2023 16:33:52 +0300 Subject: [PATCH 082/161] net: dsa: sja1105: block FDB accesses that are concurrent with a switch reset Currently, when we add the first sja1105 port to a bridge with vlan_filtering 1, then we sometimes see this output: sja1105 spi2.2: port 4 failed to read back entry for be:79:b4:9e:9e:96 vid 3088: -ENOENT sja1105 spi2.2: Reset switch and programmed static config. Reason: VLAN filtering sja1105 spi2.2: port 0 failed to add be:79:b4:9e:9e:96 vid 0 to fdb: -2 It is because sja1105_fdb_add() runs from the dsa_owq which is no longer serialized with switch resets since it dropped the rtnl_lock() in the blamed commit. Either performing the FDB accesses before the reset, or after the reset, is equally fine, because sja1105_static_fdb_change() backs up those changes in the static config, but FDB access during reset isn't ok. Make sja1105_static_config_reload() take the fdb_lock to fix that. Fixes: 0faf890fc519 ("net: dsa: drop rtnl_lock from dsa_slave_switchdev_event_work") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index cefd72617af4d..1a367e64bc3b1 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2297,6 +2297,7 @@ int sja1105_static_config_reload(struct sja1105_private *priv, int rc, i; s64 now; + mutex_lock(&priv->fdb_lock); mutex_lock(&priv->mgmt_lock); mac = priv->static_config.tables[BLK_IDX_MAC_CONFIG].entries; @@ -2409,6 +2410,7 @@ int sja1105_static_config_reload(struct sja1105_private *priv, goto out; out: mutex_unlock(&priv->mgmt_lock); + mutex_unlock(&priv->fdb_lock); return rc; } From a7b8d60b37237680009dd0b025fe8c067aba0ee3 Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Fri, 8 Sep 2023 15:01:52 +0800 Subject: [PATCH 083/161] r8152: check budget for r8152_poll() According to the document of napi, there is no rx process when the budget is 0. Therefore, r8152_poll() has to return 0 directly when the budget is equal to 0. Fixes: d2187f8e4454 ("r8152: divide the tx and rx bottom functions") Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 332c853ca99b3..0c13d9950cd85 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -2636,6 +2636,9 @@ static int r8152_poll(struct napi_struct *napi, int budget) struct r8152 *tp = container_of(napi, struct r8152, napi); int work_done; + if (!budget) + return 0; + work_done = rx_bottom(tp, budget); if (work_done < budget) { From c821a88bd720b0046433173185fd841a100d44ad Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 10 Sep 2023 02:03:10 +0900 Subject: [PATCH 084/161] kcm: Fix memory leak in error path of kcm_sendmsg() syzbot reported a memory leak like below: BUG: memory leak unreferenced object 0xffff88810b088c00 (size 240): comm "syz-executor186", pid 5012, jiffies 4294943306 (age 13.680s) hex dump (first 32 bytes): 00 89 08 0b 81 88 ff ff 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] __alloc_skb+0x1ef/0x230 net/core/skbuff.c:634 [] alloc_skb include/linux/skbuff.h:1289 [inline] [] kcm_sendmsg+0x269/0x1050 net/kcm/kcmsock.c:815 [] sock_sendmsg_nosec net/socket.c:725 [inline] [] sock_sendmsg+0x56/0xb0 net/socket.c:748 [] ____sys_sendmsg+0x365/0x470 net/socket.c:2494 [] ___sys_sendmsg+0xc9/0x130 net/socket.c:2548 [] __sys_sendmsg+0xa6/0x120 net/socket.c:2577 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd In kcm_sendmsg(), kcm_tx_msg(head)->last_skb is used as a cursor to append newly allocated skbs to 'head'. If some bytes are copied, an error occurred, and jumped to out_error label, 'last_skb' is left unmodified. A later kcm_sendmsg() will use an obsoleted 'last_skb' reference, corrupting the 'head' frag_list and causing the leak. This patch fixes this issue by properly updating the last allocated skb in 'last_skb'. Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Reported-and-tested-by: syzbot+6f98de741f7dbbfc4ccb@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6f98de741f7dbbfc4ccb Signed-off-by: Shigeru Yoshida Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 4580f61426bb8..740539a218b7c 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -939,6 +939,8 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (head != kcm->seq_skb) kfree_skb(head); + else if (copied) + kcm_tx_msg(head)->last_skb = skb; err = sk_stream_error(sk, msg->msg_flags, err); From 79b83606abc778aa3cbee535b362ce905d0b9448 Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Sun, 10 Sep 2023 06:54:45 +0200 Subject: [PATCH 085/161] efivarfs: fix statfs() on efivarfs Some firmware (notably U-Boot) provides GetVariable() and GetNextVariableName() but not QueryVariableInfo(). With commit d86ff3333cb1 ("efivarfs: expose used and total size") the statfs syscall was broken for such firmware. If QueryVariableInfo() does not exist or returns EFI_UNSUPPORTED, just report the file system size as 0 as statfs_simple() previously did. Fixes: d86ff3333cb1 ("efivarfs: expose used and total size") Link: https://lore.kernel.org/all/20230910045445.41632-1-heinrich.schuchardt@canonical.com/ Signed-off-by: Heinrich Schuchardt [ardb: log warning on QueryVariableInfo() failure] Reviewed-by: Ilias Apalodimas Signed-off-by: Ard Biesheuvel --- fs/efivarfs/super.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index e028fafa04f38..996271473609a 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -32,10 +32,16 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 storage_space, remaining_space, max_variable_size; efi_status_t status; - status = efivar_query_variable_info(attr, &storage_space, &remaining_space, - &max_variable_size); - if (status != EFI_SUCCESS) - return efi_status_to_err(status); + /* Some UEFI firmware does not implement QueryVariableInfo() */ + storage_space = remaining_space = 0; + if (efi_rt_services_supported(EFI_RT_SUPPORTED_QUERY_VARIABLE_INFO)) { + status = efivar_query_variable_info(attr, &storage_space, + &remaining_space, + &max_variable_size); + if (status != EFI_SUCCESS && status != EFI_UNSUPPORTED) + pr_warn_ratelimited("query_variable_info() failed: 0x%lx\n", + status); + } /* * This is not a normal filesystem, so no point in pretending it has a block From e10a35abb3da12b812cfb6fc6137926a0c81e39a Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Sun, 10 Sep 2023 22:40:30 +0100 Subject: [PATCH 086/161] net: ethernet: mtk_eth_soc: fix uninitialized variable Variable dma_addr in function mtk_poll_rx can be uninitialized on some of the error paths. In practise this doesn't matter, even random data present in uninitialized stack memory can safely be used in the way it happens in the error path. However, in order to make Smatch happy make sure the variable is always initialized. Signed-off-by: Daniel Golle Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 2372ce8c25803..3cffd1bd30677 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -2005,11 +2005,11 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, u8 *data, *new_data; struct mtk_rx_dma_v2 *rxd, trxd; int done = 0, bytes = 0; + dma_addr_t dma_addr = DMA_MAPPING_ERROR; while (done < budget) { unsigned int pktlen, *rxdcsum; struct net_device *netdev; - dma_addr_t dma_addr; u32 hash, reason; int mac = 0; @@ -2186,7 +2186,8 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, else rxd->rxd2 = RX_DMA_PREP_PLEN0(ring->buf_size); - if (MTK_HAS_CAPS(eth->soc->caps, MTK_36BIT_DMA)) + if (MTK_HAS_CAPS(eth->soc->caps, MTK_36BIT_DMA) && + likely(dma_addr != DMA_MAPPING_ERROR)) rxd->rxd2 |= RX_DMA_PREP_ADDR64(dma_addr); ring->calc_idx = idx; From 5a124b1fd3e6cb15a943f0cdfe96aa8f6d3d2f39 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 9 Sep 2023 20:41:56 +0200 Subject: [PATCH 087/161] net: ethernet: mtk_eth_soc: fix pse_port configuration for MT7988 MT7988 SoC support 3 NICs. Fix pse_port configuration in mtk_flow_set_output_device routine if the traffic is offloaded to eth2. Rely on mtk_pse_port definitions. Fixes: 88efedf517e6 ("net: ethernet: mtk_eth_soc: enable nft hw flowtable_offload for MT7988 SoC") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_ppe_offload.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c index a70a5417c1734..a4efbeb162084 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c @@ -214,9 +214,11 @@ mtk_flow_set_output_device(struct mtk_eth *eth, struct mtk_foe_entry *foe, dsa_port = mtk_flow_get_dsa_port(&dev); if (dev == eth->netdev[0]) - pse_port = 1; + pse_port = PSE_GDM1_PORT; else if (dev == eth->netdev[1]) - pse_port = 2; + pse_port = PSE_GDM2_PORT; + else if (dev == eth->netdev[2]) + pse_port = PSE_GDM3_PORT; else return -EOPNOTSUPP; From 78034cbece79c2d730ad0770b3b7f23eedbbecf5 Mon Sep 17 00:00:00 2001 From: Liming Sun Date: Tue, 29 Aug 2023 13:42:59 -0400 Subject: [PATCH 088/161] platform/mellanox: mlxbf-tmfifo: Drop the Rx packet if no more descriptors This commit fixes tmfifo console stuck issue when the virtual networking interface is in down state. In such case, the network Rx descriptors runs out and causes the Rx network packet staying in the head of the tmfifo thus blocking the console packets. The fix is to drop the Rx network packet when no more Rx descriptors. Function name mlxbf_tmfifo_release_pending_pkt() is also renamed to mlxbf_tmfifo_release_pkt() to be more approperiate. Fixes: 1357dfd7261f ("platform/mellanox: Add TmFifo driver for Mellanox BlueField Soc") Signed-off-by: Liming Sun Reviewed-by: Vadim Pasternak Reviewed-by: David Thompson Link: https://lore.kernel.org/r/8c0177dc938ae03f52ff7e0b62dbeee74b7bec09.1693322547.git.limings@nvidia.com Signed-off-by: Hans de Goede --- drivers/platform/mellanox/mlxbf-tmfifo.c | 66 ++++++++++++++++++------ 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c index b600b77d91ef2..5c1f859b682a8 100644 --- a/drivers/platform/mellanox/mlxbf-tmfifo.c +++ b/drivers/platform/mellanox/mlxbf-tmfifo.c @@ -59,6 +59,7 @@ struct mlxbf_tmfifo; * @vq: pointer to the virtio virtqueue * @desc: current descriptor of the pending packet * @desc_head: head descriptor of the pending packet + * @drop_desc: dummy desc for packet dropping * @cur_len: processed length of the current descriptor * @rem_len: remaining length of the pending packet * @pkt_len: total length of the pending packet @@ -75,6 +76,7 @@ struct mlxbf_tmfifo_vring { struct virtqueue *vq; struct vring_desc *desc; struct vring_desc *desc_head; + struct vring_desc drop_desc; int cur_len; int rem_len; u32 pkt_len; @@ -86,6 +88,14 @@ struct mlxbf_tmfifo_vring { struct mlxbf_tmfifo *fifo; }; +/* Check whether vring is in drop mode. */ +#define IS_VRING_DROP(_r) ({ \ + typeof(_r) (r) = (_r); \ + (r->desc_head == &r->drop_desc ? true : false); }) + +/* A stub length to drop maximum length packet. */ +#define VRING_DROP_DESC_MAX_LEN GENMASK(15, 0) + /* Interrupt types. */ enum { MLXBF_TM_RX_LWM_IRQ, @@ -262,6 +272,7 @@ static int mlxbf_tmfifo_alloc_vrings(struct mlxbf_tmfifo *fifo, vring->align = SMP_CACHE_BYTES; vring->index = i; vring->vdev_id = tm_vdev->vdev.id.device; + vring->drop_desc.len = VRING_DROP_DESC_MAX_LEN; dev = &tm_vdev->vdev.dev; size = vring_size(vring->num, vring->align); @@ -367,7 +378,7 @@ static u32 mlxbf_tmfifo_get_pkt_len(struct mlxbf_tmfifo_vring *vring, return len; } -static void mlxbf_tmfifo_release_pending_pkt(struct mlxbf_tmfifo_vring *vring) +static void mlxbf_tmfifo_release_pkt(struct mlxbf_tmfifo_vring *vring) { struct vring_desc *desc_head; u32 len = 0; @@ -596,19 +607,25 @@ static void mlxbf_tmfifo_rxtx_word(struct mlxbf_tmfifo_vring *vring, if (vring->cur_len + sizeof(u64) <= len) { /* The whole word. */ - if (is_rx) - memcpy(addr + vring->cur_len, &data, sizeof(u64)); - else - memcpy(&data, addr + vring->cur_len, sizeof(u64)); + if (!IS_VRING_DROP(vring)) { + if (is_rx) + memcpy(addr + vring->cur_len, &data, + sizeof(u64)); + else + memcpy(&data, addr + vring->cur_len, + sizeof(u64)); + } vring->cur_len += sizeof(u64); } else { /* Leftover bytes. */ - if (is_rx) - memcpy(addr + vring->cur_len, &data, - len - vring->cur_len); - else - memcpy(&data, addr + vring->cur_len, - len - vring->cur_len); + if (!IS_VRING_DROP(vring)) { + if (is_rx) + memcpy(addr + vring->cur_len, &data, + len - vring->cur_len); + else + memcpy(&data, addr + vring->cur_len, + len - vring->cur_len); + } vring->cur_len = len; } @@ -709,8 +726,16 @@ static bool mlxbf_tmfifo_rxtx_one_desc(struct mlxbf_tmfifo_vring *vring, /* Get the descriptor of the next packet. */ if (!vring->desc) { desc = mlxbf_tmfifo_get_next_pkt(vring, is_rx); - if (!desc) - return false; + if (!desc) { + /* Drop next Rx packet to avoid stuck. */ + if (is_rx) { + desc = &vring->drop_desc; + vring->desc_head = desc; + vring->desc = desc; + } else { + return false; + } + } } else { desc = vring->desc; } @@ -743,17 +768,24 @@ static bool mlxbf_tmfifo_rxtx_one_desc(struct mlxbf_tmfifo_vring *vring, vring->rem_len -= len; /* Get the next desc on the chain. */ - if (vring->rem_len > 0 && + if (!IS_VRING_DROP(vring) && vring->rem_len > 0 && (virtio16_to_cpu(vdev, desc->flags) & VRING_DESC_F_NEXT)) { idx = virtio16_to_cpu(vdev, desc->next); desc = &vr->desc[idx]; goto mlxbf_tmfifo_desc_done; } - /* Done and release the pending packet. */ - mlxbf_tmfifo_release_pending_pkt(vring); + /* Done and release the packet. */ desc = NULL; fifo->vring[is_rx] = NULL; + if (!IS_VRING_DROP(vring)) { + mlxbf_tmfifo_release_pkt(vring); + } else { + vring->pkt_len = 0; + vring->desc_head = NULL; + vring->desc = NULL; + return false; + } /* * Make sure the load/store are in order before @@ -933,7 +965,7 @@ static void mlxbf_tmfifo_virtio_del_vqs(struct virtio_device *vdev) /* Release the pending packet. */ if (vring->desc) - mlxbf_tmfifo_release_pending_pkt(vring); + mlxbf_tmfifo_release_pkt(vring); vq = vring->vq; if (vq) { vring->vq = NULL; From fc4c655821546239abb3cf4274d66b9747aa87dd Mon Sep 17 00:00:00 2001 From: Liming Sun Date: Tue, 29 Aug 2023 13:43:00 -0400 Subject: [PATCH 089/161] platform/mellanox: mlxbf-tmfifo: Drop jumbo frames This commit drops over-sized network packets to avoid tmfifo queue stuck. Fixes: 1357dfd7261f ("platform/mellanox: Add TmFifo driver for Mellanox BlueField Soc") Signed-off-by: Liming Sun Reviewed-by: Vadim Pasternak Reviewed-by: David Thompson Link: https://lore.kernel.org/r/9318936c2447f76db475c985ca6d91f057efcd41.1693322547.git.limings@nvidia.com Signed-off-by: Hans de Goede --- drivers/platform/mellanox/mlxbf-tmfifo.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c index 5c1f859b682a8..f3696a54a2bd7 100644 --- a/drivers/platform/mellanox/mlxbf-tmfifo.c +++ b/drivers/platform/mellanox/mlxbf-tmfifo.c @@ -224,7 +224,7 @@ static u8 mlxbf_tmfifo_net_default_mac[ETH_ALEN] = { static efi_char16_t mlxbf_tmfifo_efi_name[] = L"RshimMacAddr"; /* Maximum L2 header length. */ -#define MLXBF_TMFIFO_NET_L2_OVERHEAD 36 +#define MLXBF_TMFIFO_NET_L2_OVERHEAD (ETH_HLEN + VLAN_HLEN) /* Supported virtio-net features. */ #define MLXBF_TMFIFO_NET_FEATURES \ @@ -642,13 +642,14 @@ static void mlxbf_tmfifo_rxtx_word(struct mlxbf_tmfifo_vring *vring, * flag is set. */ static void mlxbf_tmfifo_rxtx_header(struct mlxbf_tmfifo_vring *vring, - struct vring_desc *desc, + struct vring_desc **desc, bool is_rx, bool *vring_change) { struct mlxbf_tmfifo *fifo = vring->fifo; struct virtio_net_config *config; struct mlxbf_tmfifo_msg_hdr hdr; int vdev_id, hdr_len; + bool drop_rx = false; /* Read/Write packet header. */ if (is_rx) { @@ -668,8 +669,8 @@ static void mlxbf_tmfifo_rxtx_header(struct mlxbf_tmfifo_vring *vring, if (ntohs(hdr.len) > __virtio16_to_cpu(virtio_legacy_is_little_endian(), config->mtu) + - MLXBF_TMFIFO_NET_L2_OVERHEAD) - return; + MLXBF_TMFIFO_NET_L2_OVERHEAD) + drop_rx = true; } else { vdev_id = VIRTIO_ID_CONSOLE; hdr_len = 0; @@ -684,16 +685,25 @@ static void mlxbf_tmfifo_rxtx_header(struct mlxbf_tmfifo_vring *vring, if (!tm_dev2) return; - vring->desc = desc; + vring->desc = *desc; vring = &tm_dev2->vrings[MLXBF_TMFIFO_VRING_RX]; *vring_change = true; } + + if (drop_rx && !IS_VRING_DROP(vring)) { + if (vring->desc_head) + mlxbf_tmfifo_release_pkt(vring); + *desc = &vring->drop_desc; + vring->desc_head = *desc; + vring->desc = *desc; + } + vring->pkt_len = ntohs(hdr.len) + hdr_len; } else { /* Network virtio has an extra header. */ hdr_len = (vring->vdev_id == VIRTIO_ID_NET) ? sizeof(struct virtio_net_hdr) : 0; - vring->pkt_len = mlxbf_tmfifo_get_pkt_len(vring, desc); + vring->pkt_len = mlxbf_tmfifo_get_pkt_len(vring, *desc); hdr.type = (vring->vdev_id == VIRTIO_ID_NET) ? VIRTIO_ID_NET : VIRTIO_ID_CONSOLE; hdr.len = htons(vring->pkt_len - hdr_len); @@ -742,7 +752,7 @@ static bool mlxbf_tmfifo_rxtx_one_desc(struct mlxbf_tmfifo_vring *vring, /* Beginning of a packet. Start to Rx/Tx packet header. */ if (vring->pkt_len == 0) { - mlxbf_tmfifo_rxtx_header(vring, desc, is_rx, &vring_change); + mlxbf_tmfifo_rxtx_header(vring, &desc, is_rx, &vring_change); (*avail)--; /* Return if new packet is for another ring. */ From 80ccd40568bcd3655b0fd0be1e9b3379fd6e1056 Mon Sep 17 00:00:00 2001 From: Shravan Kumar Ramani Date: Tue, 5 Sep 2023 08:49:32 -0400 Subject: [PATCH 090/161] platform/mellanox: mlxbf-pmc: Fix potential buffer overflows Replace sprintf with sysfs_emit where possible. Size check in mlxbf_pmc_event_list_show should account for "\0". Fixes: 1a218d312e65 ("platform/mellanox: mlxbf-pmc: Add Mellanox BlueField PMC driver") Signed-off-by: Shravan Kumar Ramani Reviewed-by: Vadim Pasternak Reviewed-by: David Thompson Link: https://lore.kernel.org/r/bef39ef32319a31b32f999065911f61b0d3b17c3.1693917738.git.shravankr@nvidia.com Signed-off-by: Hans de Goede --- drivers/platform/mellanox/mlxbf-pmc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/platform/mellanox/mlxbf-pmc.c b/drivers/platform/mellanox/mlxbf-pmc.c index be967d797c28e..95afcae7b9fa9 100644 --- a/drivers/platform/mellanox/mlxbf-pmc.c +++ b/drivers/platform/mellanox/mlxbf-pmc.c @@ -1008,7 +1008,7 @@ static ssize_t mlxbf_pmc_counter_show(struct device *dev, } else return -EINVAL; - return sprintf(buf, "0x%llx\n", value); + return sysfs_emit(buf, "0x%llx\n", value); } /* Store function for "counter" sysfs files */ @@ -1078,13 +1078,13 @@ static ssize_t mlxbf_pmc_event_show(struct device *dev, err = mlxbf_pmc_read_event(blk_num, cnt_num, is_l3, &evt_num); if (err) - return sprintf(buf, "No event being monitored\n"); + return sysfs_emit(buf, "No event being monitored\n"); evt_name = mlxbf_pmc_get_event_name(pmc->block_name[blk_num], evt_num); if (!evt_name) return -EINVAL; - return sprintf(buf, "0x%llx: %s\n", evt_num, evt_name); + return sysfs_emit(buf, "0x%llx: %s\n", evt_num, evt_name); } /* Store function for "event" sysfs files */ @@ -1139,9 +1139,9 @@ static ssize_t mlxbf_pmc_event_list_show(struct device *dev, return -EINVAL; for (i = 0, buf[0] = '\0'; i < size; ++i) { - len += sprintf(e_info, "0x%x: %s\n", events[i].evt_num, - events[i].evt_name); - if (len > PAGE_SIZE) + len += snprintf(e_info, sizeof(e_info), "0x%x: %s\n", + events[i].evt_num, events[i].evt_name); + if (len >= PAGE_SIZE) break; strcat(buf, e_info); ret = len; @@ -1168,7 +1168,7 @@ static ssize_t mlxbf_pmc_enable_show(struct device *dev, value = FIELD_GET(MLXBF_PMC_L3C_PERF_CNT_CFG_EN, perfcnt_cfg); - return sprintf(buf, "%d\n", value); + return sysfs_emit(buf, "%d\n", value); } /* Store function for "enable" sysfs files - only for l3cache */ From 0f5969452e162efc50bdc98968fb62b424a9874b Mon Sep 17 00:00:00 2001 From: Shravan Kumar Ramani Date: Tue, 5 Sep 2023 08:49:33 -0400 Subject: [PATCH 091/161] platform/mellanox: mlxbf-pmc: Fix reading of unprogrammed events This fix involves 2 changes: - All event regs have a reset value of 0, which is not a valid event_number as per the event_list for most blocks and hence seen as an error. Add a "disable" event with event_number 0 for all blocks. - The enable bit for each counter need not be checked before reading the event info, and hence removed. Fixes: 1a218d312e65 ("platform/mellanox: mlxbf-pmc: Add Mellanox BlueField PMC driver") Signed-off-by: Shravan Kumar Ramani Reviewed-by: Vadim Pasternak Reviewed-by: David Thompson Link: https://lore.kernel.org/r/04d0213932d32681de1c716b54320ed894e52425.1693917738.git.shravankr@nvidia.com Signed-off-by: Hans de Goede --- drivers/platform/mellanox/mlxbf-pmc.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/drivers/platform/mellanox/mlxbf-pmc.c b/drivers/platform/mellanox/mlxbf-pmc.c index 95afcae7b9fa9..2d4bbe99959ef 100644 --- a/drivers/platform/mellanox/mlxbf-pmc.c +++ b/drivers/platform/mellanox/mlxbf-pmc.c @@ -191,6 +191,7 @@ static const struct mlxbf_pmc_events mlxbf_pmc_smgen_events[] = { }; static const struct mlxbf_pmc_events mlxbf_pmc_trio_events_1[] = { + { 0x0, "DISABLE" }, { 0xa0, "TPIO_DATA_BEAT" }, { 0xa1, "TDMA_DATA_BEAT" }, { 0xa2, "MAP_DATA_BEAT" }, @@ -214,6 +215,7 @@ static const struct mlxbf_pmc_events mlxbf_pmc_trio_events_1[] = { }; static const struct mlxbf_pmc_events mlxbf_pmc_trio_events_2[] = { + { 0x0, "DISABLE" }, { 0xa0, "TPIO_DATA_BEAT" }, { 0xa1, "TDMA_DATA_BEAT" }, { 0xa2, "MAP_DATA_BEAT" }, @@ -246,6 +248,7 @@ static const struct mlxbf_pmc_events mlxbf_pmc_trio_events_2[] = { }; static const struct mlxbf_pmc_events mlxbf_pmc_ecc_events[] = { + { 0x0, "DISABLE" }, { 0x100, "ECC_SINGLE_ERROR_CNT" }, { 0x104, "ECC_DOUBLE_ERROR_CNT" }, { 0x114, "SERR_INJ" }, @@ -258,6 +261,7 @@ static const struct mlxbf_pmc_events mlxbf_pmc_ecc_events[] = { }; static const struct mlxbf_pmc_events mlxbf_pmc_mss_events[] = { + { 0x0, "DISABLE" }, { 0xc0, "RXREQ_MSS" }, { 0xc1, "RXDAT_MSS" }, { 0xc2, "TXRSP_MSS" }, @@ -265,6 +269,7 @@ static const struct mlxbf_pmc_events mlxbf_pmc_mss_events[] = { }; static const struct mlxbf_pmc_events mlxbf_pmc_hnf_events[] = { + { 0x0, "DISABLE" }, { 0x45, "HNF_REQUESTS" }, { 0x46, "HNF_REJECTS" }, { 0x47, "ALL_BUSY" }, @@ -323,6 +328,7 @@ static const struct mlxbf_pmc_events mlxbf_pmc_hnf_events[] = { }; static const struct mlxbf_pmc_events mlxbf_pmc_hnfnet_events[] = { + { 0x0, "DISABLE" }, { 0x12, "CDN_REQ" }, { 0x13, "DDN_REQ" }, { 0x14, "NDN_REQ" }, @@ -892,7 +898,7 @@ static int mlxbf_pmc_read_event(int blk_num, uint32_t cnt_num, bool is_l3, uint64_t *result) { uint32_t perfcfg_offset, perfval_offset; - uint64_t perfmon_cfg, perfevt, perfctl; + uint64_t perfmon_cfg, perfevt; if (cnt_num >= pmc->block[blk_num].counters) return -EINVAL; @@ -904,25 +910,6 @@ static int mlxbf_pmc_read_event(int blk_num, uint32_t cnt_num, bool is_l3, perfval_offset = perfcfg_offset + pmc->block[blk_num].counters * MLXBF_PMC_REG_SIZE; - /* Set counter in "read" mode */ - perfmon_cfg = FIELD_PREP(MLXBF_PMC_PERFMON_CONFIG_ADDR, - MLXBF_PMC_PERFCTL); - perfmon_cfg |= FIELD_PREP(MLXBF_PMC_PERFMON_CONFIG_STROBE, 1); - perfmon_cfg |= FIELD_PREP(MLXBF_PMC_PERFMON_CONFIG_WR_R_B, 0); - - if (mlxbf_pmc_write(pmc->block[blk_num].mmio_base + perfcfg_offset, - MLXBF_PMC_WRITE_REG_64, perfmon_cfg)) - return -EFAULT; - - /* Check if the counter is enabled */ - - if (mlxbf_pmc_read(pmc->block[blk_num].mmio_base + perfval_offset, - MLXBF_PMC_READ_REG_64, &perfctl)) - return -EFAULT; - - if (!FIELD_GET(MLXBF_PMC_PERFCTL_EN0, perfctl)) - return -EINVAL; - /* Set counter in "read" mode */ perfmon_cfg = FIELD_PREP(MLXBF_PMC_PERFMON_CONFIG_ADDR, MLXBF_PMC_PERFEVT); From c2dffda1d8f7511505bbbf16ba282f2079b30089 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Tue, 5 Sep 2023 09:32:43 -0400 Subject: [PATCH 092/161] platform/mellanox: mlxbf-bootctl: add NET dependency into Kconfig The latest version of the mlxbf_bootctl driver utilizes "sysfs_format_mac", and this API is only available if NET is defined in the kernel configuration. This patch changes the mlxbf_bootctl Kconfig to depend on NET. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202309031058.JvwNDBKt-lkp@intel.com/ Reported-by: Randy Dunlap Signed-off-by: David Thompson Link: https://lore.kernel.org/r/20230905133243.31550-1-davthompson@nvidia.com Signed-off-by: Hans de Goede --- drivers/platform/mellanox/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/mellanox/Kconfig b/drivers/platform/mellanox/Kconfig index 382793e73a60a..e52aea996ca51 100644 --- a/drivers/platform/mellanox/Kconfig +++ b/drivers/platform/mellanox/Kconfig @@ -60,6 +60,7 @@ config MLXBF_BOOTCTL tristate "Mellanox BlueField Firmware Boot Control driver" depends on ARM64 depends on ACPI + depends on NET help The Mellanox BlueField firmware implements functionality to request swapping the primary and alternate eMMC boot partition, From 0a138f1670bd1af13ba6949c48ea86ddd4bf557e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 4 Sep 2023 14:00:35 +0200 Subject: [PATCH 093/161] platform/mellanox: NVSW_SN2201 should depend on ACPI The only probing method supported by the Nvidia SN2201 platform driver is probing through an ACPI match table. Hence add a dependency on ACPI, to prevent asking the user about this driver when configuring a kernel without ACPI support. Fixes: 662f24826f95 ("platform/mellanox: Add support for new SN2201 system") Signed-off-by: Geert Uytterhoeven Acked-by: Vadim Pasternak Acked-by: Andi Shyti Link: https://lore.kernel.org/r/ec5a4071691ab08d58771b7732a9988e89779268.1693828363.git.geert+renesas@glider.be Signed-off-by: Hans de Goede --- drivers/platform/mellanox/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/mellanox/Kconfig b/drivers/platform/mellanox/Kconfig index e52aea996ca51..f7dfa0e785fd6 100644 --- a/drivers/platform/mellanox/Kconfig +++ b/drivers/platform/mellanox/Kconfig @@ -81,8 +81,8 @@ config MLXBF_PMC config NVSW_SN2201 tristate "Nvidia SN2201 platform driver support" - depends on HWMON - depends on I2C + depends on HWMON && I2C + depends on ACPI || COMPILE_TEST select REGMAP_I2C help This driver provides support for the Nvidia SN2201 platform. From 4106a70ddad57ee6d8f98b81d6f036740c72762b Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Tue, 5 Sep 2023 20:28:13 +1200 Subject: [PATCH 094/161] platform/x86: asus-wmi: Support 2023 ROG X16 tablet mode Add quirk for ASUS ROG X16 (GV601V, 2023 versions) Flow 2-in-1 to enable tablet mode with lid flip (all screen rotations). Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20230905082813.13470-1-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/asus-nb-wmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index fdf7da06af306..d85d895fee894 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -478,6 +478,15 @@ static const struct dmi_system_id asus_quirks[] = { }, .driver_data = &quirk_asus_tablet_mode, }, + { + .callback = dmi_matched, + .ident = "ASUS ROG FLOW X16", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "GV601V"), + }, + .driver_data = &quirk_asus_tablet_mode, + }, { .callback = dmi_matched, .ident = "ASUS VivoBook E410MA", From 8a81cf96f5510aaf9a65d103f7405079a7b0fcc5 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 7 Sep 2023 11:55:18 +0200 Subject: [PATCH 095/161] thermal/of: add missing of_node_put() for_each_child_of_node performs an of_node_get on each iteration, so a break out of the loop requires an of_node_put. This was done using the Coccinelle semantic patch iterators/for_each_child.cocci Signed-off-by: Julia Lawall Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_of.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index 4ca905723429c..1e0655b63259f 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -37,8 +37,10 @@ static int of_find_trip_id(struct device_node *np, struct device_node *trip) */ for_each_child_of_node(trips, t) { - if (t == trip) + if (t == trip) { + of_node_put(t); goto out; + } i++; } @@ -401,8 +403,10 @@ static int thermal_of_for_each_cooling_maps(struct thermal_zone_device *tz, for_each_child_of_node(cm_np, child) { ret = thermal_of_for_each_cooling_device(tz_np, child, tz, cdev, action); - if (ret) + if (ret) { + of_node_put(child); break; + } } of_node_put(cm_np); From ebc7abb35b258152d4a424f89d7c03db1d7ce61c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Sep 2023 20:18:56 +0200 Subject: [PATCH 096/161] thermal: Constify the trip argument of the .get_trend() zone callback Add 'const' to the definition of the 'trip' argument of the .get_trend() thermal zone callback to indicate that the trip point passed to it should not be modified by it and adjust the callback functions implementing it, thermal_get_trend() in the ACPI thermal driver and __ti_thermal_get_trend(), accordingly. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Michal Wilczynski --- drivers/acpi/thermal.c | 2 +- drivers/thermal/ti-soc-thermal/ti-thermal-common.c | 3 ++- include/linux/thermal.h | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index f14e68266ccd4..312730f8272ee 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -492,7 +492,7 @@ static int thermal_get_temp(struct thermal_zone_device *thermal, int *temp) } static int thermal_get_trend(struct thermal_zone_device *thermal, - struct thermal_trip *trip, + const struct thermal_trip *trip, enum thermal_trend *trend) { struct acpi_thermal *tz = thermal_zone_device_priv(thermal); diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c index 6ba2613627e13..0cf0826b805a9 100644 --- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c +++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c @@ -110,7 +110,8 @@ static inline int __ti_thermal_get_temp(struct thermal_zone_device *tz, int *tem } static int __ti_thermal_get_trend(struct thermal_zone_device *tz, - struct thermal_trip *trip, enum thermal_trend *trend) + const struct thermal_trip *trip, + enum thermal_trend *trend) { struct ti_thermal_data *data = thermal_zone_device_priv(tz); struct ti_bandgap *bgp; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index c99440aac1a11..a5ae4af955ff9 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -80,8 +80,8 @@ struct thermal_zone_device_ops { int (*set_trip_hyst) (struct thermal_zone_device *, int, int); int (*get_crit_temp) (struct thermal_zone_device *, int *); int (*set_emul_temp) (struct thermal_zone_device *, int); - int (*get_trend) (struct thermal_zone_device *, struct thermal_trip *, - enum thermal_trend *); + int (*get_trend) (struct thermal_zone_device *, + const struct thermal_trip *, enum thermal_trend *); void (*hot)(struct thermal_zone_device *); void (*critical)(struct thermal_zone_device *); }; From df203da47f4428bc286fc99318936416253a321c Mon Sep 17 00:00:00 2001 From: Nigel Croxon Date: Mon, 11 Sep 2023 14:25:23 -0700 Subject: [PATCH 097/161] md/raid1: fix error: ISO C90 forbids mixed declarations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a compile error when this commit is added: md: raid1: fix potential OOB in raid1_remove_disk() drivers/md/raid1.c: In function 'raid1_remove_disk': drivers/md/raid1.c:1844:9: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement] 1844 |         struct raid1_info *p = conf->mirrors + number;     |         ^~~~~~ That's because the new code was inserted before the struct. The change is move the struct command above this commit. Fixes: 8b0472b50bcf ("md: raid1: fix potential OOB in raid1_remove_disk()") Signed-off-by: Nigel Croxon Signed-off-by: Song Liu Link: https://lore.kernel.org/r/46d929d0-2aab-4cf2-b2bf-338963e8ba5a@redhat.com --- drivers/md/raid1.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4b30a17421623..2aabac773fe72 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1837,12 +1837,11 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct r1conf *conf = mddev->private; int err = 0; int number = rdev->raid_disk; + struct raid1_info *p = conf->mirrors + number; if (unlikely(number >= conf->raid_disks)) goto abort; - struct raid1_info *p = conf->mirrors + number; - if (rdev != p->rdev) p = conf->mirrors + conf->raid_disks + number; From 81faf9e0c3d39d47c6825469591d60a2cd0bbe10 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Mon, 28 Aug 2023 14:18:23 -0400 Subject: [PATCH 098/161] drm/amdkfd: Fix reg offset for setting CWSR grace period This patch fixes the case where the code currently passes absolute register address and not the reg offset, which HWS expects, when sending the PM4 packet to set/update CWSR grace period. Additionally, cleanup the signature of build_grace_period_packet_info function as it no longer needs the inst parameter. Signed-off-by: Mukul Joshi Reviewed-by: Jonathan Kim Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 3 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h | 3 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 6 ++---- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 3 +-- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 3 +-- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c | 3 +-- drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 3 +-- 7 files changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index f1f2c24de081e..69810b3f1c636 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -980,8 +980,7 @@ void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev, uint32_t wait_times, uint32_t grace_period, uint32_t *reg_offset, - uint32_t *reg_data, - uint32_t inst) + uint32_t *reg_data) { *reg_data = wait_times; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h index ecaead24e8c96..67bcaa3d42264 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h @@ -55,5 +55,4 @@ void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev, uint32_t wait_times, uint32_t grace_period, uint32_t *reg_offset, - uint32_t *reg_data, - uint32_t inst); + uint32_t *reg_data); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index fa5ee96f88454..3c45a188b701a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -1103,8 +1103,7 @@ void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev, uint32_t wait_times, uint32_t grace_period, uint32_t *reg_offset, - uint32_t *reg_data, - uint32_t inst) + uint32_t *reg_data) { *reg_data = wait_times; @@ -1120,8 +1119,7 @@ void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev, SCH_WAVE, grace_period); - *reg_offset = SOC15_REG_OFFSET(GC, GET_INST(GC, inst), - mmCP_IQ_WAIT_TIME2); + *reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2); } void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h index 936e501908cef..ce424615f59b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h @@ -100,5 +100,4 @@ void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev, uint32_t wait_times, uint32_t grace_period, uint32_t *reg_offset, - uint32_t *reg_data, - uint32_t inst); + uint32_t *reg_data); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index b166f30f083e0..8a6cb41444a47 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1677,8 +1677,7 @@ static int start_cpsch(struct device_queue_manager *dqm) dqm->dev->kfd2kgd->build_grace_period_packet_info( dqm->dev->adev, dqm->wait_times, grace_period, ®_offset, - &dqm->wait_times, - ffs(dqm->dev->xcc_mask) - 1); + &dqm->wait_times); } dqm_unlock(dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c index 8ce6f52009051..1a03173e23133 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c @@ -299,8 +299,7 @@ static int pm_set_grace_period_v9(struct packet_manager *pm, pm->dqm->wait_times, grace_period, ®_offset, - ®_data, - 0); + ®_data); if (grace_period == USE_DEFAULT_GRACE_PERIOD) reg_data = pm->dqm->wait_times; diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index 8433f99f66679..f3f40dbb8ff71 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -326,8 +326,7 @@ struct kfd2kgd_calls { uint32_t wait_times, uint32_t grace_period, uint32_t *reg_offset, - uint32_t *reg_data, - uint32_t inst); + uint32_t *reg_data); void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid, int *wave_cnt, int *max_waves_per_cu, uint32_t inst); void (*program_trap_handler_settings)(struct amdgpu_device *adev, From 2f06b27444f928a79389b149247508bdad54252b Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Tue, 29 Aug 2023 12:06:09 -0400 Subject: [PATCH 099/161] drm/amdkfd: Fix unaligned 64-bit doorbell warning This patch fixes the following unaligned 64-bit doorbell warning seen when submitting packets on HIQ on GFX v9.4.3 by making the HIQ doorbell 64-bit aligned. The warning is seen when GPU is loaded in any mode other than SPX mode. [ +0.000301] ------------[ cut here ]------------ [ +0.000003] Unaligned 64-bit doorbell [ +0.000030] WARNING: /amdkfd/kfd_doorbell.c:339 write_kernel_doorbell64+0x72/0x80 [ +0.000003] RIP: 0010:write_kernel_doorbell64+0x72/0x80 [ +0.000004] RSP: 0018:ffffc90004287730 EFLAGS: 00010246 [ +0.000005] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ +0.000003] RDX: 0000000000000001 RSI: ffffffff82837c71 RDI: 00000000ffffffff [ +0.000003] RBP: ffffc90004287748 R08: 0000000000000003 R09: 0000000000000001 [ +0.000002] R10: 000000000000001a R11: ffff88a034008198 R12: ffffc900013bd004 [ +0.000003] R13: 0000000000000008 R14: ffffc900042877b0 R15: 000000000000007f [ +0.000003] FS: 00007fa8c7b62000(0000) GS:ffff889f88400000(0000) knlGS:0000000000000000 [ +0.000004] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ +0.000003] CR2: 000056111c45aaf0 CR3: 00000001414f2002 CR4: 0000000000770ee0 [ +0.000003] PKRU: 55555554 [ +0.000002] Call Trace: [ +0.000004] [ +0.000006] kq_submit_packet+0x45/0x50 [amdgpu] [ +0.000524] pm_send_set_resources+0x7f/0xc0 [amdgpu] [ +0.000500] set_sched_resources+0xe4/0x160 [amdgpu] [ +0.000503] start_cpsch+0x1c5/0x2a0 [amdgpu] [ +0.000497] kgd2kfd_device_init.cold+0x816/0xb42 [amdgpu] [ +0.000743] amdgpu_amdkfd_device_init+0x15f/0x1f0 [amdgpu] [ +0.000602] amdgpu_device_init.cold+0x1813/0x2176 [amdgpu] [ +0.000684] ? pci_bus_read_config_word+0x4a/0x80 [ +0.000012] ? do_pci_enable_device+0xdc/0x110 [ +0.000008] amdgpu_driver_load_kms+0x1a/0x110 [amdgpu] [ +0.000545] amdgpu_pci_probe+0x197/0x400 [amdgpu] Fixes: c31866651086 ("drm/amdgpu: use doorbell mgr for kfd kernel doorbells") Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index c2e0b79dcc6da..7b38537c7c99b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -162,6 +162,7 @@ void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, return NULL; *doorbell_off = amdgpu_doorbell_index_on_bar(kfd->adev, kfd->doorbells, inx); + inx *= 2; pr_debug("Get kernel queue doorbell\n" " doorbell offset == 0x%08X\n" @@ -176,6 +177,7 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) unsigned int inx; inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr); + inx /= 2; mutex_lock(&kfd->doorbell_mutex); __clear_bit(inx, kfd->doorbell_bitmap); From 97e3c6a853f2af9145daf0c6ca25bcdf55c759d4 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 25 Aug 2023 11:59:09 -0400 Subject: [PATCH 100/161] drm/amdgpu: Store CU info from all XCCs for GFX v9.4.3 Currently, we store CU info only for a single XCC assuming that it is the same for all XCCs. However, that may not be true. As a result, store CU info for all XCCs. This info is later used for CU masking. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 76 +++++++++---------- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 3 +- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 8 +- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 11 ++- .../gpu/drm/amd/include/kgd_kfd_interface.h | 6 +- 14 files changed, 60 insertions(+), 65 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index cdf6087706aa8..25d5fda5b243e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -478,7 +478,7 @@ void amdgpu_amdkfd_get_cu_info(struct amdgpu_device *adev, struct kfd_cu_info *c cu_info->cu_active_number = acu_info.number; cu_info->cu_ao_mask = acu_info.ao_cu_mask; memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0], - sizeof(acu_info.bitmap)); + sizeof(cu_info->cu_bitmap)); cu_info->num_shader_engines = adev->gfx.config.max_shader_engines; cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se; cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 395c1768b9fc7..0ca95c4d4bfbe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -43,6 +43,7 @@ #define AMDGPU_GFX_LBPW_DISABLED_MODE 0x00000008L #define AMDGPU_MAX_GC_INSTANCES 8 +#define KGD_MAX_QUEUES 128 #define AMDGPU_MAX_GFX_QUEUES KGD_MAX_QUEUES #define AMDGPU_MAX_COMPUTE_QUEUES KGD_MAX_QUEUES @@ -257,7 +258,7 @@ struct amdgpu_cu_info { uint32_t number; uint32_t ao_cu_mask; uint32_t ao_cu_bitmap[4][4]; - uint32_t bitmap[4][4]; + uint32_t bitmap[AMDGPU_MAX_GC_INSTANCES][4][4]; }; struct amdgpu_gfx_ras { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 99f4df133ed3e..2cd2ecebf4656 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -839,7 +839,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) memcpy(&dev_info->cu_ao_bitmap[0], &adev->gfx.cu_info.ao_cu_bitmap[0], sizeof(adev->gfx.cu_info.ao_cu_bitmap)); memcpy(&dev_info->cu_bitmap[0], &adev->gfx.cu_info.bitmap[0], - sizeof(adev->gfx.cu_info.bitmap)); + sizeof(dev_info->cu_bitmap)); dev_info->vram_type = adev->gmc.vram_type; dev_info->vram_bit_width = adev->gmc.vram_width; dev_info->vce_harvest_config = adev->vce.harvest_config; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 0aee9c8288a2b..9032d7a24d7cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -9449,7 +9449,7 @@ static int gfx_v10_0_get_cu_info(struct amdgpu_device *adev, gfx_v10_0_set_user_wgp_inactive_bitmap_per_sh( adev, disable_masks[i * 2 + j]); bitmap = gfx_v10_0_get_cu_active_bitmap_per_sh(adev); - cu_info->bitmap[i][j] = bitmap; + cu_info->bitmap[0][i][j] = bitmap; for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) { if (bitmap & mask) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 5c3db694afa87..762d7a19f1be1 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -6368,7 +6368,7 @@ static int gfx_v11_0_get_cu_info(struct amdgpu_device *adev, * SE6: {SH0,SH1} --> {bitmap[2][2], bitmap[2][3]} * SE7: {SH0,SH1} --> {bitmap[3][2], bitmap[3][3]} */ - cu_info->bitmap[i % 4][j + (i / 4) * 2] = bitmap; + cu_info->bitmap[0][i % 4][j + (i / 4) * 2] = bitmap; for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) { if (bitmap & mask) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c index da6caff78c22b..34f9211b26793 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c @@ -3577,7 +3577,7 @@ static void gfx_v6_0_get_cu_info(struct amdgpu_device *adev) gfx_v6_0_set_user_cu_inactive_bitmap( adev, disable_masks[i * 2 + j]); bitmap = gfx_v6_0_get_cu_enabled(adev); - cu_info->bitmap[i][j] = bitmap; + cu_info->bitmap[0][i][j] = bitmap; for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) { if (bitmap & mask) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index 90b034b173c1c..c2faf6b4c2fce 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -5119,7 +5119,7 @@ static void gfx_v7_0_get_cu_info(struct amdgpu_device *adev) gfx_v7_0_set_user_cu_inactive_bitmap( adev, disable_masks[i * 2 + j]); bitmap = gfx_v7_0_get_cu_active_bitmap(adev); - cu_info->bitmap[i][j] = bitmap; + cu_info->bitmap[0][i][j] = bitmap; for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) { if (bitmap & mask) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 51c1745c83697..885ebd703260f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -7121,7 +7121,7 @@ static void gfx_v8_0_get_cu_info(struct amdgpu_device *adev) gfx_v8_0_set_user_cu_inactive_bitmap( adev, disable_masks[i * 2 + j]); bitmap = gfx_v8_0_get_cu_active_bitmap(adev); - cu_info->bitmap[i][j] = bitmap; + cu_info->bitmap[0][i][j] = bitmap; for (k = 0; k < adev->gfx.config.max_cu_per_sh; k ++) { if (bitmap & mask) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 458faf657042e..fd61574a737cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -1499,7 +1499,7 @@ static void gfx_v9_0_init_always_on_cu_mask(struct amdgpu_device *adev) amdgpu_gfx_select_se_sh(adev, i, j, 0xffffffff, 0); for (k = 0; k < adev->gfx.config.max_cu_per_sh; k ++) { - if (cu_info->bitmap[i][j] & mask) { + if (cu_info->bitmap[0][i][j] & mask) { if (counter == pg_always_on_cu_num) WREG32_SOC15(GC, 0, mmRLC_PG_ALWAYS_ON_CU_MASK, cu_bitmap); if (counter < always_on_cu_num) @@ -7233,7 +7233,7 @@ static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev, * SE6,SH0 --> bitmap[2][1] * SE7,SH0 --> bitmap[3][1] */ - cu_info->bitmap[i % 4][j + i / 4] = bitmap; + cu_info->bitmap[0][i % 4][j + i / 4] = bitmap; for (k = 0; k < adev->gfx.config.max_cu_per_sh; k ++) { if (bitmap & mask) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 0a26a00074a63..18ce5fe45f6f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -4259,7 +4259,7 @@ static void gfx_v9_4_3_set_gds_init(struct amdgpu_device *adev) } static void gfx_v9_4_3_set_user_cu_inactive_bitmap(struct amdgpu_device *adev, - u32 bitmap) + u32 bitmap, int xcc_id) { u32 data; @@ -4269,15 +4269,15 @@ static void gfx_v9_4_3_set_user_cu_inactive_bitmap(struct amdgpu_device *adev, data = bitmap << GC_USER_SHADER_ARRAY_CONFIG__INACTIVE_CUS__SHIFT; data &= GC_USER_SHADER_ARRAY_CONFIG__INACTIVE_CUS_MASK; - WREG32_SOC15(GC, GET_INST(GC, 0), regGC_USER_SHADER_ARRAY_CONFIG, data); + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SHADER_ARRAY_CONFIG, data); } -static u32 gfx_v9_4_3_get_cu_active_bitmap(struct amdgpu_device *adev) +static u32 gfx_v9_4_3_get_cu_active_bitmap(struct amdgpu_device *adev, int xcc_id) { u32 data, mask; - data = RREG32_SOC15(GC, GET_INST(GC, 0), regCC_GC_SHADER_ARRAY_CONFIG); - data |= RREG32_SOC15(GC, GET_INST(GC, 0), regGC_USER_SHADER_ARRAY_CONFIG); + data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCC_GC_SHADER_ARRAY_CONFIG); + data |= RREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SHADER_ARRAY_CONFIG); data &= CC_GC_SHADER_ARRAY_CONFIG__INACTIVE_CUS_MASK; data >>= CC_GC_SHADER_ARRAY_CONFIG__INACTIVE_CUS__SHIFT; @@ -4290,7 +4290,7 @@ static u32 gfx_v9_4_3_get_cu_active_bitmap(struct amdgpu_device *adev) static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev, struct amdgpu_cu_info *cu_info) { - int i, j, k, counter, active_cu_number = 0; + int i, j, k, counter, xcc_id, active_cu_number = 0; u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0; unsigned disable_masks[4 * 4]; @@ -4309,46 +4309,38 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev, adev->gfx.config.max_sh_per_se); mutex_lock(&adev->grbm_idx_mutex); - for (i = 0; i < adev->gfx.config.max_shader_engines; i++) { - for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) { - mask = 1; - ao_bitmap = 0; - counter = 0; - gfx_v9_4_3_xcc_select_se_sh(adev, i, j, 0xffffffff, 0); - gfx_v9_4_3_set_user_cu_inactive_bitmap( - adev, disable_masks[i * adev->gfx.config.max_sh_per_se + j]); - bitmap = gfx_v9_4_3_get_cu_active_bitmap(adev); - - /* - * The bitmap(and ao_cu_bitmap) in cu_info structure is - * 4x4 size array, and it's usually suitable for Vega - * ASICs which has 4*2 SE/SH layout. - * But for Arcturus, SE/SH layout is changed to 8*1. - * To mostly reduce the impact, we make it compatible - * with current bitmap array as below: - * SE4,SH0 --> bitmap[0][1] - * SE5,SH0 --> bitmap[1][1] - * SE6,SH0 --> bitmap[2][1] - * SE7,SH0 --> bitmap[3][1] - */ - cu_info->bitmap[i % 4][j + i / 4] = bitmap; - - for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) { - if (bitmap & mask) { - if (counter < adev->gfx.config.max_cu_per_sh) - ao_bitmap |= mask; - counter++; + for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) { + for (i = 0; i < adev->gfx.config.max_shader_engines; i++) { + for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) { + mask = 1; + ao_bitmap = 0; + counter = 0; + gfx_v9_4_3_xcc_select_se_sh(adev, i, j, 0xffffffff, xcc_id); + gfx_v9_4_3_set_user_cu_inactive_bitmap( + adev, + disable_masks[i * adev->gfx.config.max_sh_per_se + j], + xcc_id); + bitmap = gfx_v9_4_3_get_cu_active_bitmap(adev, xcc_id); + + cu_info->bitmap[xcc_id][i][j] = bitmap; + + for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) { + if (bitmap & mask) { + if (counter < adev->gfx.config.max_cu_per_sh) + ao_bitmap |= mask; + counter++; + } + mask <<= 1; } - mask <<= 1; + active_cu_number += counter; + if (i < 2 && j < 2) + ao_cu_mask |= (ao_bitmap << (i * 16 + j * 8)); + cu_info->ao_cu_bitmap[i][j] = ao_bitmap; } - active_cu_number += counter; - if (i < 2 && j < 2) - ao_cu_mask |= (ao_bitmap << (i * 16 + j * 8)); - cu_info->ao_cu_bitmap[i % 4][j + i / 4] = ao_bitmap; } + gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, + xcc_id); } - gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, - 0); mutex_unlock(&adev->grbm_idx_mutex); cu_info->number = active_cu_number; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 86fb7ac7982a1..f76b7aee5c0a1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -2087,7 +2087,8 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info); cu->num_simd_per_cu = cu_info.simd_per_cu; - cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; + cu->num_simd_cores = cu_info.simd_per_cu * + (cu_info.cu_active_number / kdev->kfd->num_nodes); cu->max_waves_simd = cu_info.max_waves_per_simd; cu->wave_front_size = cu_info.wave_front_size; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index d01bb57733b36..763966236658e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -104,11 +104,13 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, bool wgp_mode_req = KFD_GC_VERSION(mm->dev) >= IP_VERSION(10, 0, 0); uint32_t en_mask = wgp_mode_req ? 0x3 : 0x1; int i, se, sh, cu, cu_bitmap_sh_mul, inc = wgp_mode_req ? 2 : 1; + uint32_t cu_active_per_node; amdgpu_amdkfd_get_cu_info(mm->dev->adev, &cu_info); - if (cu_mask_count > cu_info.cu_active_number) - cu_mask_count = cu_info.cu_active_number; + cu_active_per_node = cu_info.cu_active_number / mm->dev->kfd->num_nodes; + if (cu_mask_count > cu_active_per_node) + cu_mask_count = cu_active_per_node; /* Exceeding these bounds corrupts the stack and indicates a coding error. * Returning with no CU's enabled will hang the queue, which should be @@ -141,7 +143,7 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, for (se = 0; se < cu_info.num_shader_engines; se++) for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) cu_per_sh[se][sh] = hweight32( - cu_info.cu_bitmap[se % 4][sh + (se / 4) * cu_bitmap_sh_mul]); + cu_info.cu_bitmap[0][se % 4][sh + (se / 4) * cu_bitmap_sh_mul]); /* Symmetrically map cu_mask to all SEs & SHs: * se_mask programs up to 2 SH in the upper and lower 16 bits. diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index ff98fded95349..c54795682dfbd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -450,8 +450,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, sysfs_show_32bit_prop(buffer, offs, "cpu_cores_count", dev->node_props.cpu_cores_count); sysfs_show_32bit_prop(buffer, offs, "simd_count", - dev->gpu ? (dev->node_props.simd_count * - NUM_XCC(dev->gpu->xcc_mask)) : 0); + dev->gpu ? dev->node_props.simd_count : 0); sysfs_show_32bit_prop(buffer, offs, "mem_banks_count", dev->node_props.mem_banks_count); sysfs_show_32bit_prop(buffer, offs, "caches_count", @@ -1604,7 +1603,7 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, int i, j, k; struct kfd_cache_properties *pcache = NULL; - cu_sibling_map_mask = cu_info->cu_bitmap[0][0]; + cu_sibling_map_mask = cu_info->cu_bitmap[0][0][0]; cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1); first_active_cu = ffs(cu_sibling_map_mask); @@ -1647,7 +1646,7 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); k += 4; - cu_sibling_map_mask = cu_info->cu_bitmap[i % 4][j + i / 4]; + cu_sibling_map_mask = cu_info->cu_bitmap[0][i % 4][j + i / 4]; cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1); } } @@ -1708,8 +1707,8 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) { ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info, - pcu_info->cu_bitmap[i % 4][j + i / 4], ct, - cu_processor_id, k); + pcu_info->cu_bitmap[0][i % 4][j + i / 4], ct, + cu_processor_id, k); if (ret < 0) break; diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index f3f40dbb8ff71..3b5a56585c4b7 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -31,12 +31,12 @@ #include #include #include +#include "amdgpu_irq.h" +#include "amdgpu_gfx.h" struct pci_dev; struct amdgpu_device; -#define KGD_MAX_QUEUES 128 - struct kfd_dev; struct kgd_mem; @@ -68,7 +68,7 @@ struct kfd_cu_info { uint32_t wave_front_size; uint32_t max_scratch_slots_per_cu; uint32_t lds_size; - uint32_t cu_bitmap[4][4]; + uint32_t cu_bitmap[AMDGPU_MAX_GC_INSTANCES][4][4]; }; /* For getting GPU local memory information from KGD */ From 0752e66e91fa86fa5481b04b22053363833ffb85 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 25 Aug 2023 12:18:06 -0400 Subject: [PATCH 101/161] drm/amdkfd: Update cache info reporting for GFX v9.4.3 Update cache info reporting in sysfs to report the correct number of CUs and associated cache information based on different spatial partitioning modes. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 4 ++ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 82 +++++++++++++---------- drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 2 +- 3 files changed, 51 insertions(+), 37 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h index 387a8ef493855..74c2d7a0d6285 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -79,6 +79,10 @@ struct crat_header { #define CRAT_SUBTYPE_IOLINK_AFFINITY 5 #define CRAT_SUBTYPE_MAX 6 +/* + * Do not change the value of CRAT_SIBLINGMAP_SIZE from 32 + * as it breaks the ABI. + */ #define CRAT_SIBLINGMAP_SIZE 32 /* diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index c54795682dfbd..c8c75ff7cea80 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1596,14 +1596,17 @@ static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext, static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, struct kfd_gpu_cache_info *pcache_info, struct kfd_cu_info *cu_info, - int cache_type, unsigned int cu_processor_id) + int cache_type, unsigned int cu_processor_id, + struct kfd_node *knode) { unsigned int cu_sibling_map_mask; int first_active_cu; - int i, j, k; + int i, j, k, xcc, start, end; struct kfd_cache_properties *pcache = NULL; - cu_sibling_map_mask = cu_info->cu_bitmap[0][0][0]; + start = ffs(knode->xcc_mask) - 1; + end = start + NUM_XCC(knode->xcc_mask); + cu_sibling_map_mask = cu_info->cu_bitmap[start][0][0]; cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1); first_active_cu = ffs(cu_sibling_map_mask); @@ -1638,16 +1641,18 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, cu_sibling_map_mask = cu_sibling_map_mask >> (first_active_cu - 1); k = 0; - for (i = 0; i < cu_info->num_shader_engines; i++) { - for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) { - pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF); - pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); - pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); - pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); - k += 4; - - cu_sibling_map_mask = cu_info->cu_bitmap[0][i % 4][j + i / 4]; - cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1); + for (xcc = start; xcc < end; xcc++) { + for (i = 0; i < cu_info->num_shader_engines; i++) { + for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) { + pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF); + pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); + pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); + pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); + k += 4; + + cu_sibling_map_mask = cu_info->cu_bitmap[xcc][i % 4][j + i / 4]; + cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1); + } } } pcache->sibling_map_size = k; @@ -1665,7 +1670,7 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_node *kdev) { struct kfd_gpu_cache_info *pcache_info = NULL; - int i, j, k; + int i, j, k, xcc, start, end; int ct = 0; unsigned int cu_processor_id; int ret; @@ -1699,37 +1704,42 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct * then it will consider only one CU from * the shared unit */ + start = ffs(kdev->xcc_mask) - 1; + end = start + NUM_XCC(kdev->xcc_mask); + for (ct = 0; ct < num_of_cache_types; ct++) { cu_processor_id = gpu_processor_id; if (pcache_info[ct].cache_level == 1) { - for (i = 0; i < pcu_info->num_shader_engines; i++) { - for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) { - for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) { - - ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info, - pcu_info->cu_bitmap[0][i % 4][j + i / 4], ct, - cu_processor_id, k); - - if (ret < 0) - break; - - if (!ret) { - num_of_entries++; - list_add_tail(&props_ext->list, &dev->cache_props); + for (xcc = start; xcc < end; xcc++) { + for (i = 0; i < pcu_info->num_shader_engines; i++) { + for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) { + for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) { + + ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info, + pcu_info->cu_bitmap[xcc][i % 4][j + i / 4], ct, + cu_processor_id, k); + + if (ret < 0) + break; + + if (!ret) { + num_of_entries++; + list_add_tail(&props_ext->list, &dev->cache_props); + } + + /* Move to next CU block */ + num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <= + pcu_info->num_cu_per_sh) ? + pcache_info[ct].num_cu_shared : + (pcu_info->num_cu_per_sh - k); + cu_processor_id += num_cu_shared; } - - /* Move to next CU block */ - num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <= - pcu_info->num_cu_per_sh) ? - pcache_info[ct].num_cu_shared : - (pcu_info->num_cu_per_sh - k); - cu_processor_id += num_cu_shared; } } } } else { ret = fill_in_l2_l3_pcache(&props_ext, pcache_info, - pcu_info, ct, cu_processor_id); + pcu_info, ct, cu_processor_id, kdev); if (ret < 0) break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index dea32a9e55060..27386ce9a021d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -89,7 +89,7 @@ struct kfd_mem_properties { struct attribute attr; }; -#define CACHE_SIBLINGMAP_SIZE 64 +#define CACHE_SIBLINGMAP_SIZE 128 struct kfd_cache_properties { struct list_head list; From fc6efed2c728c9c10b058512fc9c1613f870a8e8 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Tue, 22 Aug 2023 11:35:25 -0400 Subject: [PATCH 102/161] drm/amdkfd: Update CU masking for GFX 9.4.3 The CU mask passed from user-space will change based on different spatial partitioning mode. As a result, update CU masking code for GFX9.4.3 to work for all partitioning modes. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 28 ++++++++--- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 2 +- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 2 +- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c | 2 +- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 2 +- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 46 ++++++++++++------- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 2 +- 7 files changed, 56 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 763966236658e..447829c22295c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -97,14 +97,16 @@ void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd, void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, const uint32_t *cu_mask, uint32_t cu_mask_count, - uint32_t *se_mask) + uint32_t *se_mask, uint32_t inst) { struct kfd_cu_info cu_info; uint32_t cu_per_sh[KFD_MAX_NUM_SE][KFD_MAX_NUM_SH_PER_SE] = {0}; bool wgp_mode_req = KFD_GC_VERSION(mm->dev) >= IP_VERSION(10, 0, 0); uint32_t en_mask = wgp_mode_req ? 0x3 : 0x1; - int i, se, sh, cu, cu_bitmap_sh_mul, inc = wgp_mode_req ? 2 : 1; + int i, se, sh, cu, cu_bitmap_sh_mul, cu_inc = wgp_mode_req ? 2 : 1; uint32_t cu_active_per_node; + int inc = cu_inc * NUM_XCC(mm->dev->xcc_mask); + int xcc_inst = inst + ffs(mm->dev->xcc_mask) - 1; amdgpu_amdkfd_get_cu_info(mm->dev->adev, &cu_info); @@ -143,7 +145,8 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, for (se = 0; se < cu_info.num_shader_engines; se++) for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) cu_per_sh[se][sh] = hweight32( - cu_info.cu_bitmap[0][se % 4][sh + (se / 4) * cu_bitmap_sh_mul]); + cu_info.cu_bitmap[xcc_inst][se % 4][sh + (se / 4) * + cu_bitmap_sh_mul]); /* Symmetrically map cu_mask to all SEs & SHs: * se_mask programs up to 2 SH in the upper and lower 16 bits. @@ -166,20 +169,33 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, * cu_mask[0] bit8 -> se_mask[0] bit1 (SE0,SH0,CU1) * ... * + * For GFX 9.4.3, the following code only looks at a + * subset of the cu_mask corresponding to the inst parameter. + * If we have n XCCs under one GPU node + * cu_mask[0] bit0 -> XCC0 se_mask[0] bit0 (XCC0,SE0,SH0,CU0) + * cu_mask[0] bit1 -> XCC1 se_mask[0] bit0 (XCC1,SE0,SH0,CU0) + * .. + * cu_mask[0] bitn -> XCCn se_mask[0] bit0 (XCCn,SE0,SH0,CU0) + * cu_mask[0] bit n+1 -> XCC0 se_mask[1] bit0 (XCC0,SE1,SH0,CU0) + * + * For example, if there are 6 XCCs under 1 KFD node, this code + * running for each inst, will look at the bits as: + * inst, inst + 6, inst + 12... + * * First ensure all CUs are disabled, then enable user specified CUs. */ for (i = 0; i < cu_info.num_shader_engines; i++) se_mask[i] = 0; - i = 0; - for (cu = 0; cu < 16; cu += inc) { + i = inst; + for (cu = 0; cu < 16; cu += cu_inc) { for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) { for (se = 0; se < cu_info.num_shader_engines; se++) { if (cu_per_sh[se][sh] > cu) { if (cu_mask[i / 32] & (en_mask << (i % 32))) se_mask[se] |= en_mask << (cu + sh * 16); i += inc; - if (i == cu_mask_count) + if (i >= cu_mask_count) return; } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index 23158db7da035..57bf5e513f4d1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -138,7 +138,7 @@ void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd, void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, const uint32_t *cu_mask, uint32_t cu_mask_count, - uint32_t *se_mask); + uint32_t *se_mask, uint32_t inst); int kfd_hiq_load_mqd_kiq(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index ee1d32d957f2b..1a4a69943c714 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -52,7 +52,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, return; mqd_symmetrically_map_cu_mask(mm, - minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask); + minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask, 0); m = get_mqd(mqd); m->compute_static_thread_mgmt_se0 = se_mask[0]; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c index 83699392c8089..8b7fed9135269 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c @@ -52,7 +52,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, return; mqd_symmetrically_map_cu_mask(mm, - minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask); + minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask, 0); m = get_mqd(mqd); m->compute_static_thread_mgmt_se0 = se_mask[0]; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index 0bbf0edbabd47..964b5d50a77e6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -71,7 +71,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, } mqd_symmetrically_map_cu_mask(mm, - minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask); + minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask, 0); m->compute_static_thread_mgmt_se0 = se_mask[0]; m->compute_static_thread_mgmt_se1 = se_mask[1]; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index e23d32f356077..42d881809dc70 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -60,7 +60,7 @@ static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) } static void update_cu_mask(struct mqd_manager *mm, void *mqd, - struct mqd_update_info *minfo) + struct mqd_update_info *minfo, uint32_t inst) { struct v9_mqd *m; uint32_t se_mask[KFD_MAX_NUM_SE] = {0}; @@ -69,27 +69,36 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, return; mqd_symmetrically_map_cu_mask(mm, - minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask); + minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask, inst); m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; m->compute_static_thread_mgmt_se1 = se_mask[1]; m->compute_static_thread_mgmt_se2 = se_mask[2]; m->compute_static_thread_mgmt_se3 = se_mask[3]; - m->compute_static_thread_mgmt_se4 = se_mask[4]; - m->compute_static_thread_mgmt_se5 = se_mask[5]; - m->compute_static_thread_mgmt_se6 = se_mask[6]; - m->compute_static_thread_mgmt_se7 = se_mask[7]; - - pr_debug("update cu mask to %#x %#x %#x %#x %#x %#x %#x %#x\n", - m->compute_static_thread_mgmt_se0, - m->compute_static_thread_mgmt_se1, - m->compute_static_thread_mgmt_se2, - m->compute_static_thread_mgmt_se3, - m->compute_static_thread_mgmt_se4, - m->compute_static_thread_mgmt_se5, - m->compute_static_thread_mgmt_se6, - m->compute_static_thread_mgmt_se7); + if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3)) { + m->compute_static_thread_mgmt_se4 = se_mask[4]; + m->compute_static_thread_mgmt_se5 = se_mask[5]; + m->compute_static_thread_mgmt_se6 = se_mask[6]; + m->compute_static_thread_mgmt_se7 = se_mask[7]; + + pr_debug("update cu mask to %#x %#x %#x %#x %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3, + m->compute_static_thread_mgmt_se4, + m->compute_static_thread_mgmt_se5, + m->compute_static_thread_mgmt_se6, + m->compute_static_thread_mgmt_se7); + } else { + pr_debug("inst: %u, update cu mask to %#x %#x %#x %#x\n", + inst, m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); + } } static void set_priority(struct v9_mqd *m, struct queue_properties *q) @@ -290,7 +299,8 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, if (mm->dev->kfd->cwsr_enabled && q->ctx_save_restore_area_address) m->cp_hqd_ctx_save_control = 0; - update_cu_mask(mm, mqd, minfo); + if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3)) + update_cu_mask(mm, mqd, minfo, 0); set_priority(m, q); q->is_active = QUEUE_IS_ACTIVE(*q); @@ -676,6 +686,8 @@ static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd + size * xcc); update_mqd(mm, m, q, minfo); + update_cu_mask(mm, mqd, minfo, xcc); + if (q->format == KFD_QUEUE_FORMAT_AQL) { switch (xcc) { case 0: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 657c378229808..3e1a574d4ea66 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -55,7 +55,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, return; mqd_symmetrically_map_cu_mask(mm, - minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask); + minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask, 0); m = get_mqd(mqd); m->compute_static_thread_mgmt_se0 = se_mask[0]; From 6be6d112419713334ddd9c01f219ca16adaa4c76 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 8 Sep 2023 08:57:02 +0800 Subject: [PATCH 103/161] blk-mq: fix tags UAF when shrinking q->nr_hw_queues When nr_hw_queues shrink, we free the excess tags before realloc'ing hw_ctxs for each queue. During that resize, we may need to access those tags, like blk_mq_tag_idle(hctx) will access queue shared tags. This can cause a slab use-after-free, as reported by KASAN. Fix it by moving the releasing of excess tags to the end. Fixes: e1dd7bc93029 ("blk-mq: fix tags leak when shrink nr_hw_queues") Reported-by: Yi Zhang Closes: https://lore.kernel.org/all/CAHj4cs_CK63uoDpGBGZ6DN4OCTpzkR3UaVgK=LX8Owr8ej2ieQ@mail.gmail.com/ Cc: Ming Lei Signed-off-by: Chengming Zhou Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20230908005702.2183908-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ec922c6bccbe2..1fafd54dce3cb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4405,11 +4405,8 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, struct blk_mq_tags **new_tags; int i; - if (set->nr_hw_queues >= new_nr_hw_queues) { - for (i = new_nr_hw_queues; i < set->nr_hw_queues; i++) - __blk_mq_free_map_and_rqs(set, i); + if (set->nr_hw_queues >= new_nr_hw_queues) goto done; - } new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); @@ -4719,7 +4716,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, { struct request_queue *q; LIST_HEAD(head); - int prev_nr_hw_queues; + int prev_nr_hw_queues = set->nr_hw_queues; + int i; lockdep_assert_held(&set->tag_list_lock); @@ -4746,7 +4744,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, blk_mq_sysfs_unregister_hctxs(q); } - prev_nr_hw_queues = set->nr_hw_queues; if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto reregister; @@ -4781,6 +4778,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_unfreeze_queue(q); + + /* Free the excess tags when nr_hw_queues shrink. */ + for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) + __blk_mq_free_map_and_rqs(set, i); } void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) From ef064187a9709393a981a56cce1e31880fd97107 Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Fri, 8 Sep 2023 16:46:39 +0800 Subject: [PATCH 104/161] drm/amd/display: fix the white screen issue when >= 64GB DRAM Dropping bit 31:4 of page table base is wrong, it makes page table base points to wrong address if phys addr is beyond 64GB; dropping page_table_start/end bit 31:4 is unnecessary since dcn20_vmid_setup will do that. Also, while we are at it, cleanup the assignments using upper_32_bits()/lower_32_bits() and AMDGPU_GPU_PAGE_SHIFT. Cc: stable@vger.kernel.org Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2354 Fixes: 81d0bcf99009 ("drm/amdgpu: make display pinning more flexible (v2)") Acked-by: Harry Wentland Reviewed-by: Alex Deucher Signed-off-by: Yifan Zhang Co-developed-by: Hamza Mahfooz Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 88ba8b66de1f7..6a0ea15936aef 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1274,11 +1274,15 @@ static void mmhub_read_system_context(struct amdgpu_device *adev, struct dc_phy_ pt_base = amdgpu_gmc_pd_addr(adev->gart.bo); - page_table_start.high_part = (u32)(adev->gmc.gart_start >> 44) & 0xF; - page_table_start.low_part = (u32)(adev->gmc.gart_start >> 12); - page_table_end.high_part = (u32)(adev->gmc.gart_end >> 44) & 0xF; - page_table_end.low_part = (u32)(adev->gmc.gart_end >> 12); - page_table_base.high_part = upper_32_bits(pt_base) & 0xF; + page_table_start.high_part = upper_32_bits(adev->gmc.gart_start >> + AMDGPU_GPU_PAGE_SHIFT); + page_table_start.low_part = lower_32_bits(adev->gmc.gart_start >> + AMDGPU_GPU_PAGE_SHIFT); + page_table_end.high_part = upper_32_bits(adev->gmc.gart_end >> + AMDGPU_GPU_PAGE_SHIFT); + page_table_end.low_part = lower_32_bits(adev->gmc.gart_end >> + AMDGPU_GPU_PAGE_SHIFT); + page_table_base.high_part = upper_32_bits(pt_base); page_table_base.low_part = lower_32_bits(pt_base); pa_config->system_aperture.start_addr = (uint64_t)logical_addr_low << 18; From 169ed4ece8373f02f10642eae5240e3d1ef5c038 Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Fri, 8 Sep 2023 10:36:44 -0400 Subject: [PATCH 105/161] Revert "drm/amd: Disable S/G for APUs when 64GB or more host memory" This reverts commit 70e64c4d522b732e31c6475a3be2349de337d321. Since, we now have an actual fix for this issue, we can get rid of this workaround as it can cause pin failures if enough VRAM isn't carved out by the BIOS. Cc: stable@vger.kernel.org # 6.1+ Acked-by: Harry Wentland Reviewed-by: Alex Deucher Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 26 ------------------- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 5 ++-- 3 files changed, 3 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index dc2d53081e806..a79d53bdbe136 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1293,7 +1293,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, void amdgpu_device_pci_config_reset(struct amdgpu_device *adev); int amdgpu_device_pci_reset(struct amdgpu_device *adev); bool amdgpu_device_need_post(struct amdgpu_device *adev); -bool amdgpu_sg_display_supported(struct amdgpu_device *adev); bool amdgpu_device_pcie_dynamic_switching_supported(void); bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev); bool amdgpu_device_aspm_support_quirk(void); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3f001a50b34a6..30c4f5cca02c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1244,32 +1244,6 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) return true; } -/* - * On APUs with >= 64GB white flickering has been observed w/ SG enabled. - * Disable S/G on such systems until we have a proper fix. - * https://gitlab.freedesktop.org/drm/amd/-/issues/2354 - * https://gitlab.freedesktop.org/drm/amd/-/issues/2735 - */ -bool amdgpu_sg_display_supported(struct amdgpu_device *adev) -{ - switch (amdgpu_sg_display) { - case -1: - break; - case 0: - return false; - case 1: - return true; - default: - return false; - } - if ((totalram_pages() << (PAGE_SHIFT - 10)) + - (adev->gmc.real_vram_size / 1024) >= 64000000) { - DRM_WARN("Disabling S/G due to >=64GB RAM\n"); - return false; - } - return true; -} - /* * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic * speed switching. Until we have confirmation from Intel that a specific host diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 6a0ea15936aef..954906c515aa9 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1644,8 +1644,9 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) } break; } - if (init_data.flags.gpu_vm_support) - init_data.flags.gpu_vm_support = amdgpu_sg_display_supported(adev); + if (init_data.flags.gpu_vm_support && + (amdgpu_sg_display == 0)) + init_data.flags.gpu_vm_support = false; if (init_data.flags.gpu_vm_support) adev->mode_info.gpu_vm_support = true; From 679fc891bf11845730b572fc44f8a0eb846aba29 Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Tue, 22 Aug 2023 10:02:46 -0400 Subject: [PATCH 106/161] drm/amd/display: Add dirty rect support for Replay Dirty rect can be used with replay, so enable them to allow for more powersaving. Reviewed-by: Sun peng Li Acked-by: Stylon Wang Signed-off-by: Bhawanpreet Lakha Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 954906c515aa9..ca129983a08b2 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -8078,7 +8078,8 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, bundle->surface_updates[planes_count].plane_info = &bundle->plane_infos[planes_count]; - if (acrtc_state->stream->link->psr_settings.psr_feature_enabled) { + if (acrtc_state->stream->link->psr_settings.psr_feature_enabled || + acrtc_state->stream->link->replay_settings.replay_feature_enabled) { fill_dc_dirty_rects(plane, old_plane_state, new_plane_state, new_crtc_state, &bundle->flip_addrs[planes_count], From 81cc8779cf46d6323c83475706b61d9552230274 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Sep 2023 13:54:38 +0300 Subject: [PATCH 107/161] drm/amdgpu: fix retry loop test This loop will exit with "retry" set to -1 if it fails but the code checks for if "retry" is zero. Fix this by changing post-op to a pre-op. --retry vs retry--. Fixes: e01eeffc3f86 ("drm/amd/pm: avoid driver getting empty metrics table for the first time") Reviewed-by: Evan Quan Signed-off-by: Dan Carpenter Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 199a673b81201..de80e191a92c4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -336,7 +336,7 @@ static int smu_v13_0_6_setup_driver_pptable(struct smu_context *smu) /* Store one-time values in driver PPTable */ if (!pptable->Init) { - while (retry--) { + while (--retry) { ret = smu_v13_0_6_get_metrics_table(smu, NULL, true); if (ret) return ret; From f5b2c10b57615828b531bb0ae56bd6325a41167e Mon Sep 17 00:00:00 2001 From: Swapnil Patel Date: Thu, 17 Aug 2023 14:04:26 -0400 Subject: [PATCH 108/161] drm/amd/display: Don't check registers, if using AUX BL control [Why] Currently the driver looks DCN registers to access if BL is on or not. This check is not valid if we are using AUX based brightness control. This causes driver to not send out "backlight off" command during power off sequence as it already thinks it is off. [How] Only check DCN registers if we aren't using AUX based brightness control. Reviewed-by: Wenjing Liu Acked-by: Stylon Wang Signed-off-by: Swapnil Patel Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c index ad967b58d7bec..478281f2a5ba7 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c @@ -964,7 +964,9 @@ void dce110_edp_backlight_control( return; } - if (link->panel_cntl) { + if (link->panel_cntl && !(link->dpcd_sink_ext_caps.bits.oled || + link->dpcd_sink_ext_caps.bits.hdr_aux_backlight_control == 1 || + link->dpcd_sink_ext_caps.bits.sdr_aux_backlight_control == 1)) { bool is_backlight_on = link->panel_cntl->funcs->is_panel_backlight_on(link->panel_cntl); if ((enable && is_backlight_on) || (!enable && !is_backlight_on)) { From 1832403cd41ca6b19b24e9d64f79cb08d920ca44 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 6 Sep 2023 11:35:04 -0400 Subject: [PATCH 109/161] drm/amdgpu/soc21: don't remap HDP registers for SR-IOV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This matches the behavior for soc15 and nv. Acked-by: Christian König Reviewed-by: Timmy Tsai Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/soc21.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 40d23738ee4ec..8b2ff2b281b0a 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -766,7 +766,7 @@ static int soc21_common_hw_init(void *handle) * for the purpose of expose those registers * to process space */ - if (adev->nbio.funcs->remap_hdp_registers) + if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ adev->nbio.funcs->enable_doorbell_aperture(adev, true); From ab43213e7afd08ac68d4282060bacf309e70fd14 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 7 Sep 2023 15:44:54 -0400 Subject: [PATCH 110/161] drm/amdgpu/nbio4.3: set proper rmmio_remap.reg_offset for SR-IOV Needed for HDP flush to work correctly. Reviewed-by: Timmy Tsai Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c index d5ed9e0e1a5f1..e5b5b0f4940f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c @@ -345,6 +345,9 @@ static void nbio_v4_3_init_registers(struct amdgpu_device *adev) data &= ~RCC_DEV0_EPF2_STRAP2__STRAP_NO_SOFT_RESET_DEV0_F2_MASK; WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF2_STRAP2, data); } + if (amdgpu_sriov_vf(adev)) + adev->rmmio_remap.reg_offset = SOC15_REG_OFFSET(NBIO, 0, + regBIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL) << 2; } static u32 nbio_v4_3_get_rom_offset(struct amdgpu_device *adev) From ffd6bde302061aeee405ab364403af30210f0b99 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 8 Sep 2023 21:21:55 +0800 Subject: [PATCH 111/161] drm/amdgpu: fallback to old RAS error message for aqua_vanjaram So driver doesn't generate incorrect message until the new format is settled down for aqua_vanjaram Signed-off-by: Hawking Zhang Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3c4600e15b862..937c54fc71745 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1052,7 +1052,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, info->ce_count = obj->err_data.ce_count; if (err_data.ce_count) { - if (adev->smuio.funcs && + if (!adev->aid_mask && + adev->smuio.funcs && adev->smuio.funcs->get_socket_id && adev->smuio.funcs->get_die_id) { dev_info(adev->dev, "socket: %d, die: %d " @@ -1072,7 +1073,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, } } if (err_data.ue_count) { - if (adev->smuio.funcs && + if (!adev->aid_mask && + adev->smuio.funcs && adev->smuio.funcs->get_socket_id && adev->smuio.funcs->get_die_id) { dev_info(adev->dev, "socket: %d, die: %d " From ec5fa9fcdeca69edf7dab5ca3b2e0ceb1c08fe9a Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Tue, 22 Aug 2023 16:03:17 +0800 Subject: [PATCH 112/161] drm/amd/display: Adjust the MST resume flow [Why] In drm_dp_mst_topology_mgr_resume() today, it will resume the mst branch to be ready handling mst mode and also consecutively do the mst topology probing. Which will cause the dirver have chance to fire hotplug event before restoring the old state. Then Userspace will react to the hotplug event based on a wrong state. [How] Adjust the mst resume flow as: 1. set dpcd to resume mst branch status 2. restore source old state 3. Do mst resume topology probing For drm_dp_mst_topology_mgr_resume(), it's better to adjust it to pull out topology probing work into a 2nd part procedure of the mst resume. Will have a follow up patch in drm. Reviewed-by: Chao-kai Wang Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Acked-by: Stylon Wang Signed-off-by: Wayne Lin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 93 ++++++++++++++++--- 1 file changed, 80 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index ca129983a08b2..c6fd34bab3585 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2340,14 +2340,62 @@ static int dm_late_init(void *handle) return detect_mst_link_for_all_connectors(adev_to_drm(adev)); } +static void resume_mst_branch_status(struct drm_dp_mst_topology_mgr *mgr) +{ + int ret; + u8 guid[16]; + u64 tmp64; + + mutex_lock(&mgr->lock); + if (!mgr->mst_primary) + goto out_fail; + + if (drm_dp_read_dpcd_caps(mgr->aux, mgr->dpcd) < 0) { + drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during suspend?\n"); + goto out_fail; + } + + ret = drm_dp_dpcd_writeb(mgr->aux, DP_MSTM_CTRL, + DP_MST_EN | + DP_UP_REQ_EN | + DP_UPSTREAM_IS_SRC); + if (ret < 0) { + drm_dbg_kms(mgr->dev, "mst write failed - undocked during suspend?\n"); + goto out_fail; + } + + /* Some hubs forget their guids after they resume */ + ret = drm_dp_dpcd_read(mgr->aux, DP_GUID, guid, 16); + if (ret != 16) { + drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during suspend?\n"); + goto out_fail; + } + + if (memchr_inv(guid, 0, 16) == NULL) { + tmp64 = get_jiffies_64(); + memcpy(&guid[0], &tmp64, sizeof(u64)); + memcpy(&guid[8], &tmp64, sizeof(u64)); + + ret = drm_dp_dpcd_write(mgr->aux, DP_GUID, guid, 16); + + if (ret != 16) { + drm_dbg_kms(mgr->dev, "check mstb guid failed - undocked during suspend?\n"); + goto out_fail; + } + } + + memcpy(mgr->mst_primary->guid, guid, 16); + +out_fail: + mutex_unlock(&mgr->lock); +} + static void s3_handle_mst(struct drm_device *dev, bool suspend) { struct amdgpu_dm_connector *aconnector; struct drm_connector *connector; struct drm_connector_list_iter iter; struct drm_dp_mst_topology_mgr *mgr; - int ret; - bool need_hotplug = false; drm_connector_list_iter_begin(dev, &iter); drm_for_each_connector_iter(connector, &iter) { @@ -2369,18 +2417,15 @@ static void s3_handle_mst(struct drm_device *dev, bool suspend) if (!dp_is_lttpr_present(aconnector->dc_link)) try_to_configure_aux_timeout(aconnector->dc_link->ddc, LINK_AUX_DEFAULT_TIMEOUT_PERIOD); - ret = drm_dp_mst_topology_mgr_resume(mgr, true); - if (ret < 0) { - dm_helpers_dp_mst_stop_top_mgr(aconnector->dc_link->ctx, - aconnector->dc_link); - need_hotplug = true; - } + /* TODO: move resume_mst_branch_status() into drm mst resume again + * once topology probing work is pulled out from mst resume into mst + * resume 2nd step. mst resume 2nd step should be called after old + * state getting restored (i.e. drm_atomic_helper_resume()). + */ + resume_mst_branch_status(mgr); } } drm_connector_list_iter_end(&iter); - - if (need_hotplug) - drm_kms_helper_hotplug_event(dev); } static int amdgpu_dm_smu_write_watermarks_table(struct amdgpu_device *adev) @@ -2774,7 +2819,8 @@ static int dm_resume(void *handle) struct dm_atomic_state *dm_state = to_dm_atomic_state(dm->atomic_obj.state); enum dc_connection_type new_connection_type = dc_connection_none; struct dc_state *dc_state; - int i, r, j; + int i, r, j, ret; + bool need_hotplug = false; if (amdgpu_in_reset(adev)) { dc_state = dm->cached_dc_state; @@ -2872,7 +2918,7 @@ static int dm_resume(void *handle) continue; /* - * this is the case when traversing through already created + * this is the case when traversing through already created end sink * MST connectors, should be skipped */ if (aconnector && aconnector->mst_root) @@ -2932,6 +2978,27 @@ static int dm_resume(void *handle) dm->cached_state = NULL; + /* Do mst topology probing after resuming cached state*/ + drm_connector_list_iter_begin(ddev, &iter); + drm_for_each_connector_iter(connector, &iter) { + aconnector = to_amdgpu_dm_connector(connector); + if (aconnector->dc_link->type != dc_connection_mst_branch || + aconnector->mst_root) + continue; + + ret = drm_dp_mst_topology_mgr_resume(&aconnector->mst_mgr, true); + + if (ret < 0) { + dm_helpers_dp_mst_stop_top_mgr(aconnector->dc_link->ctx, + aconnector->dc_link); + need_hotplug = true; + } + } + drm_connector_list_iter_end(&iter); + + if (need_hotplug) + drm_kms_helper_hotplug_event(ddev); + amdgpu_dm_irq_resume_late(adev); amdgpu_dm_smu_write_watermarks_table(adev); From fc52a64416b010c8324e2cb50070faae868521c1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 8 Sep 2023 16:39:29 -0400 Subject: [PATCH 113/161] tracing/synthetic: Fix order of struct trace_dynamic_info To make handling BIG and LITTLE endian better the offset/len of dynamic fields of the synthetic events was changed into a structure of: struct trace_dynamic_info { #ifdef CONFIG_CPU_BIG_ENDIAN u16 offset; u16 len; #else u16 len; u16 offset; #endif }; to replace the manual changes of: data_offset = offset & 0xffff; data_offest = len << 16; But if you look closely, the above is: << 16 | offset Which in little endian would be in memory: offset_lo offset_hi len_lo len_hi and in big endian: len_hi len_lo offset_hi offset_lo Which if broken into a structure would be: struct trace_dynamic_info { #ifdef CONFIG_CPU_BIG_ENDIAN u16 len; u16 offset; #else u16 offset; u16 len; #endif }; Which is the opposite of what was defined. Fix this and just to be safe also add "__packed". Link: https://lore.kernel.org/all/20230908154417.5172e343@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20230908163929.2c25f3dc@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland Tested-by: Sven Schnelle Acked-by: Masami Hiramatsu (Google) Fixes: ddeea494a16f3 ("tracing/synthetic: Use union instead of casts") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 12f875e9e69a7..21ae37e49319a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -62,13 +62,13 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...); /* Used to find the offset and length of dynamic fields in trace events */ struct trace_dynamic_info { #ifdef CONFIG_CPU_BIG_ENDIAN - u16 offset; u16 len; + u16 offset; #else - u16 len; u16 offset; + u16 len; #endif -}; +} __packed; /* * The trace entry - the most basic unit of tracing. This is what From 9296da8c40900b4dae3d973aa22be306e2a77671 Mon Sep 17 00:00:00 2001 From: David Francis Date: Tue, 22 Nov 2022 15:14:32 -0500 Subject: [PATCH 114/161] drm/amdkfd: Checkpoint and restore queues on GFX11 The code in kfd_mqd_manager_v11.c to support criu dump and restore of queue state was missing. Added it; should be equivalent to kfd_mqd_manager_v10.c. CC: Felix Kuehling Reviewed-by: Harish Kasiviswanathan Acked-by: Alex Deucher Signed-off-by: David Francis Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index 964b5d50a77e6..15277f1d5cf0a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -321,6 +321,43 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, return 0; } +static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst) +{ + struct v11_compute_mqd *m; + + m = get_mqd(mqd); + + memcpy(mqd_dst, m, sizeof(struct v11_compute_mqd)); +} + +static void restore_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *qp, + const void *mqd_src, + const void *ctl_stack_src, const u32 ctl_stack_size) +{ + uint64_t addr; + struct v11_compute_mqd *m; + + m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + + memcpy(m, mqd_src, sizeof(*m)); + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + + m->cp_hqd_pq_doorbell_control = + qp->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + qp->is_active = 0; +} + + static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -458,6 +495,8 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, mqd->mqd_size = sizeof(struct v11_compute_mqd); mqd->get_wave_state = get_wave_state; mqd->mqd_stride = kfd_mqd_stride; + mqd->checkpoint_mqd = checkpoint_mqd; + mqd->restore_mqd = restore_mqd; #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif @@ -502,6 +541,8 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = kfd_destroy_mqd_sdma; mqd->is_occupied = kfd_is_occupied_sdma; + mqd->checkpoint_mqd = checkpoint_mqd; + mqd->restore_mqd = restore_mqd; mqd->mqd_size = sizeof(struct v11_sdma_mqd); mqd->mqd_stride = kfd_mqd_stride; #if defined(CONFIG_DEBUG_FS) From 62663b849662c1a5126b6274d91671b90566ef13 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 11 Sep 2023 17:17:04 +0300 Subject: [PATCH 115/161] tracing/synthetic: Print out u64 values properly The synth traces incorrectly print pointer to the synthetic event values instead of the actual value when using u64 type. Fix by addressing the contents of the union properly. Link: https://lore.kernel.org/linux-trace-kernel/20230911141704.3585965-1-tero.kristo@linux.intel.com Fixes: ddeea494a16f ("tracing/synthetic: Use union instead of casts") Cc: stable@vger.kernel.org Signed-off-by: Tero Kristo Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 9897d0bfcab71..14cb275a0bab0 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -337,7 +337,7 @@ static void print_synth_event_num_val(struct trace_seq *s, break; default: - trace_seq_printf(s, print_fmt, name, val, space); + trace_seq_printf(s, print_fmt, name, val->as_u64, space); break; } } From 5e7e82254270c8cf8b107451c5de01cee2f135ae Mon Sep 17 00:00:00 2001 From: David Francis Date: Tue, 5 Sep 2023 10:13:51 -0400 Subject: [PATCH 116/161] drm/amdgpu: Handle null atom context in VBIOS info ioctl On some APU systems, there is no atom context and so the atom_context struct is null. Add a check to the VBIOS_INFO branch of amdgpu_info_ioctl to handle this case, returning all zeroes. Reviewed-by: Alex Deucher Signed-off-by: David Francis Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 2cd2ecebf4656..d30dc0b718c73 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -940,12 +940,17 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) struct atom_context *atom_context; atom_context = adev->mode_info.atom_context; - memcpy(vbios_info.name, atom_context->name, sizeof(atom_context->name)); - memcpy(vbios_info.vbios_pn, atom_context->vbios_pn, sizeof(atom_context->vbios_pn)); - vbios_info.version = atom_context->version; - memcpy(vbios_info.vbios_ver_str, atom_context->vbios_ver_str, - sizeof(atom_context->vbios_ver_str)); - memcpy(vbios_info.date, atom_context->date, sizeof(atom_context->date)); + if (atom_context) { + memcpy(vbios_info.name, atom_context->name, + sizeof(atom_context->name)); + memcpy(vbios_info.vbios_pn, atom_context->vbios_pn, + sizeof(atom_context->vbios_pn)); + vbios_info.version = atom_context->version; + memcpy(vbios_info.vbios_ver_str, atom_context->vbios_ver_str, + sizeof(atom_context->vbios_ver_str)); + memcpy(vbios_info.date, atom_context->date, + sizeof(atom_context->date)); + } return copy_to_user(out, &vbios_info, min((size_t)size, sizeof(vbios_info))) ? -EFAULT : 0; From db5494a85294f057e0bb41bdb5372c2dbf46fb79 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 10 Sep 2023 16:44:50 -0700 Subject: [PATCH 117/161] drm/amd/display: fix replay_mode kernel-doc warning Fix the typo in the kernel-doc for @replay_mode to prevent kernel-doc warnings: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h:623: warning: Incorrect use of kernel-doc format: * @replay mode: Replay supported drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h:626: warning: Function parameter or member 'replay_mode' not described in 'amdgpu_hdmi_vsdb_info' Fixes: ec8e59cb4e0c ("drm/amd/display: Get replay info from VSDB") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Bhawanpreet Lakha Cc: Harry Wentland Cc: Alex Deucher Cc: Leo Li Cc: Rodrigo Siqueira Cc: amd-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index a2d34be82613c..9e4cc5eeda767 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -620,7 +620,7 @@ struct amdgpu_hdmi_vsdb_info { unsigned int max_refresh_rate_hz; /** - * @replay mode: Replay supported + * @replay_mode: Replay supported */ bool replay_mode; }; From 64be47ba286117ee4e3dd9d064c88ea2913e3269 Mon Sep 17 00:00:00 2001 From: Mustapha Ghaddar Date: Thu, 10 Aug 2023 16:20:23 -0400 Subject: [PATCH 118/161] drm/amd/display: Add DPIA Link Encoder Assignment Fix For DPIA we should have preferred DIG assignment based on DPIA selected as per the ASIC design. Reviewed-by: George Shen Acked-by: Hamza Mahfooz Signed-off-by: Mustapha Ghaddar Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../drm/amd/display/dc/core/dc_link_enc_cfg.c | 35 +++++++++++++++---- drivers/gpu/drm/amd/display/dc/dc.h | 1 + .../amd/display/dc/dcn314/dcn314_resource.c | 23 ++++++++++++ .../gpu/drm/amd/display/dc/inc/core_types.h | 1 + .../drm/amd/display/dc/link/link_factory.c | 4 +++ 5 files changed, 58 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_enc_cfg.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_enc_cfg.c index 30c0644d4418f..b66eeac4d3d2f 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_enc_cfg.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_enc_cfg.c @@ -169,11 +169,23 @@ static void add_link_enc_assignment( /* Return first available DIG link encoder. */ static enum engine_id find_first_avail_link_enc( const struct dc_context *ctx, - const struct dc_state *state) + const struct dc_state *state, + enum engine_id eng_id_requested) { enum engine_id eng_id = ENGINE_ID_UNKNOWN; int i; + if (eng_id_requested != ENGINE_ID_UNKNOWN) { + + for (i = 0; i < ctx->dc->res_pool->res_cap->num_dig_link_enc; i++) { + eng_id = state->res_ctx.link_enc_cfg_ctx.link_enc_avail[i]; + if (eng_id == eng_id_requested) + return eng_id; + } + } + + eng_id = ENGINE_ID_UNKNOWN; + for (i = 0; i < ctx->dc->res_pool->res_cap->num_dig_link_enc; i++) { eng_id = state->res_ctx.link_enc_cfg_ctx.link_enc_avail[i]; if (eng_id != ENGINE_ID_UNKNOWN) @@ -287,7 +299,7 @@ void link_enc_cfg_link_encs_assign( struct dc_stream_state *streams[], uint8_t stream_count) { - enum engine_id eng_id = ENGINE_ID_UNKNOWN; + enum engine_id eng_id = ENGINE_ID_UNKNOWN, eng_id_req = ENGINE_ID_UNKNOWN; int i; int j; @@ -377,8 +389,15 @@ void link_enc_cfg_link_encs_assign( * assigned to that endpoint. */ link_enc = get_link_enc_used_by_link(state, stream->link); - if (link_enc == NULL) - eng_id = find_first_avail_link_enc(stream->ctx, state); + if (link_enc == NULL) { + + if (stream->link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA && + stream->link->dpia_preferred_eng_id != ENGINE_ID_UNKNOWN) + eng_id_req = stream->link->dpia_preferred_eng_id; + + if (eng_id == ENGINE_ID_UNKNOWN) + eng_id = find_first_avail_link_enc(stream->ctx, state, eng_id_req); + } else eng_id = link_enc->preferred_engine; @@ -402,7 +421,9 @@ void link_enc_cfg_link_encs_assign( DC_LOG_DEBUG("%s: CUR %s(%d) - enc_id(%d)\n", __func__, assignment.ep_id.ep_type == DISPLAY_ENDPOINT_PHY ? "PHY" : "DPIA", - assignment.ep_id.link_id.enum_id - 1, + assignment.ep_id.ep_type == DISPLAY_ENDPOINT_PHY ? + assignment.ep_id.link_id.enum_id : + assignment.ep_id.link_id.enum_id - 1, assignment.eng_id); } for (i = 0; i < MAX_PIPES; i++) { @@ -413,7 +434,9 @@ void link_enc_cfg_link_encs_assign( DC_LOG_DEBUG("%s: NEW %s(%d) - enc_id(%d)\n", __func__, assignment.ep_id.ep_type == DISPLAY_ENDPOINT_PHY ? "PHY" : "DPIA", - assignment.ep_id.link_id.enum_id - 1, + assignment.ep_id.ep_type == DISPLAY_ENDPOINT_PHY ? + assignment.ep_id.link_id.enum_id : + assignment.ep_id.link_id.enum_id - 1, assignment.eng_id); } diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 0d0bef8eb331d..31e3183497a7f 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1496,6 +1496,7 @@ struct dc_link { * object creation. */ enum engine_id eng_id; + enum engine_id dpia_preferred_eng_id; bool test_pattern_enabled; enum dp_test_pattern current_test_pattern; diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c index 1c1fb2fa08229..004beed9bd444 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_resource.c @@ -1032,6 +1032,28 @@ static const struct dce_i2c_mask i2c_masks = { I2C_COMMON_MASK_SH_LIST_DCN30(_MASK) }; +/* ========================================================== */ + +/* + * DPIA index | Preferred Encoder | Host Router + * 0 | C | 0 + * 1 | First Available | 0 + * 2 | D | 1 + * 3 | First Available | 1 + */ +/* ========================================================== */ +static const enum engine_id dpia_to_preferred_enc_id_table[] = { + ENGINE_ID_DIGC, + ENGINE_ID_DIGC, + ENGINE_ID_DIGD, + ENGINE_ID_DIGD +}; + +static enum engine_id dcn314_get_preferred_eng_id_dpia(unsigned int dpia_index) +{ + return dpia_to_preferred_enc_id_table[dpia_index]; +} + static struct dce_i2c_hw *dcn31_i2c_hw_create( struct dc_context *ctx, uint32_t inst) @@ -1785,6 +1807,7 @@ static struct resource_funcs dcn314_res_pool_funcs = { .update_bw_bounding_box = dcn314_update_bw_bounding_box, .patch_unknown_plane_state = dcn20_patch_unknown_plane_state, .get_panel_config_defaults = dcn314_get_panel_config_defaults, + .get_preferred_eng_id_dpia = dcn314_get_preferred_eng_id_dpia, }; static struct clock_source *dcn30_clock_source_create( diff --git a/drivers/gpu/drm/amd/display/dc/inc/core_types.h b/drivers/gpu/drm/amd/display/dc/inc/core_types.h index 027aec70c070f..eaad1260bfd18 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/core_types.h +++ b/drivers/gpu/drm/amd/display/dc/inc/core_types.h @@ -65,6 +65,7 @@ struct resource_context; struct clk_bw_params; struct resource_funcs { + enum engine_id (*get_preferred_eng_id_dpia)(unsigned int dpia_index); void (*destroy)(struct resource_pool **pool); void (*link_init)(struct dc_link *link); struct panel_cntl*(*panel_cntl_create)( diff --git a/drivers/gpu/drm/amd/display/dc/link/link_factory.c b/drivers/gpu/drm/amd/display/dc/link/link_factory.c index 195ca9e52edaa..0895742a31024 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_factory.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_factory.c @@ -791,6 +791,10 @@ static bool construct_dpia(struct dc_link *link, /* Set dpia port index : 0 to number of dpia ports */ link->ddc_hw_inst = init_params->connector_index; + // Assign Dpia preferred eng_id + if (link->dc->res_pool->funcs->get_preferred_eng_id_dpia) + link->dpia_preferred_eng_id = link->dc->res_pool->funcs->get_preferred_eng_id_dpia(link->ddc_hw_inst); + /* TODO: Create link encoder */ link->psr_settings.psr_version = DC_PSR_VERSION_UNSUPPORTED; From 29319378449035c6fc6391b31a3c2cbaf75be221 Mon Sep 17 00:00:00 2001 From: Mustapha Ghaddar Date: Tue, 22 Aug 2023 16:18:03 -0400 Subject: [PATCH 119/161] drm/amd/display: Fix 2nd DPIA encoder Assignment [HOW & Why] There seems to be an issue with 2nd DPIA acquiring link encoder for tiled displays. Solution is to remove check for eng_id before we get first dynamic encoder for it Reviewed-by: Cruise Hung Reviewed-by: Meenakshikumar Somasundaram Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Acked-by: Stylon Wang Signed-off-by: Mustapha Ghaddar Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_link_enc_cfg.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_enc_cfg.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_enc_cfg.c index b66eeac4d3d2f..be5a6d008b290 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_enc_cfg.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_enc_cfg.c @@ -395,8 +395,7 @@ void link_enc_cfg_link_encs_assign( stream->link->dpia_preferred_eng_id != ENGINE_ID_UNKNOWN) eng_id_req = stream->link->dpia_preferred_eng_id; - if (eng_id == ENGINE_ID_UNKNOWN) - eng_id = find_first_avail_link_enc(stream->ctx, state, eng_id_req); + eng_id = find_first_avail_link_enc(stream->ctx, state, eng_id_req); } else eng_id = link_enc->preferred_engine; @@ -501,7 +500,6 @@ struct dc_link *link_enc_cfg_get_link_using_link_enc( if (stream) link = stream->link; - // dm_output_to_console("%s: No link using DIG(%d).\n", __func__, eng_id); return link; } From a06023a8f78d3e9e73ca4363ccf3871a06e16ecc Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Fri, 8 Sep 2023 20:19:16 +0000 Subject: [PATCH 120/161] selftests/user_events: Fix failures when user_events is not installed When user_events is not installed the self tests currently fail. Now that these self tests run by default we need to ensure they don't fail when user_events was not enabled for the kernel being tested. Add common methods to detect if tracefs and user_events is enabled. If either is not enabled skip the test. If tracefs is enabled, but is not mounted, mount tracefs and fail if there were any errors. Fail if not run as root. Fixes: 68b4d2d58389 ("selftests/user_events: Reenable build") Reported-by: Naresh Kamboju Link: https://lore.kernel.org/all/CA+G9fYuugZ0OMeS6HvpSS4nuf_A3s455ecipGBvER0LJHojKZg@mail.gmail.com/ Signed-off-by: Beau Belgrave Signed-off-by: Shuah Khan --- .../testing/selftests/user_events/abi_test.c | 3 + .../testing/selftests/user_events/dyn_test.c | 2 + .../selftests/user_events/ftrace_test.c | 3 + .../testing/selftests/user_events/perf_test.c | 3 + .../user_events/user_events_selftests.h | 100 ++++++++++++++++++ 5 files changed, 111 insertions(+) create mode 100644 tools/testing/selftests/user_events/user_events_selftests.h diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c index 5125c42efe657..22374d29ffdd4 100644 --- a/tools/testing/selftests/user_events/abi_test.c +++ b/tools/testing/selftests/user_events/abi_test.c @@ -19,6 +19,7 @@ #include #include "../kselftest_harness.h" +#include "user_events_selftests.h" const char *data_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = "/sys/kernel/tracing/events/user_events/__abi_event/enable"; @@ -93,6 +94,8 @@ FIXTURE(user) { }; FIXTURE_SETUP(user) { + USER_EVENT_FIXTURE_SETUP(return); + change_event(false); self->check = 0; } diff --git a/tools/testing/selftests/user_events/dyn_test.c b/tools/testing/selftests/user_events/dyn_test.c index 91a4444ad42b2..32c827a52d7d8 100644 --- a/tools/testing/selftests/user_events/dyn_test.c +++ b/tools/testing/selftests/user_events/dyn_test.c @@ -15,6 +15,7 @@ #include #include "../kselftest_harness.h" +#include "user_events_selftests.h" const char *abi_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = "/sys/kernel/tracing/events/user_events/__test_event/enable"; @@ -146,6 +147,7 @@ FIXTURE(user) { }; FIXTURE_SETUP(user) { + USER_EVENT_FIXTURE_SETUP(return); } FIXTURE_TEARDOWN(user) { diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index 5beb0aef1d814..6a260caeeddc5 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -16,6 +16,7 @@ #include #include "../kselftest_harness.h" +#include "user_events_selftests.h" const char *data_file = "/sys/kernel/tracing/user_events_data"; const char *status_file = "/sys/kernel/tracing/user_events_status"; @@ -206,6 +207,8 @@ FIXTURE(user) { }; FIXTURE_SETUP(user) { + USER_EVENT_FIXTURE_SETUP(return); + self->status_fd = open(status_file, O_RDONLY); ASSERT_NE(-1, self->status_fd); diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index 8b09be566fa21..f893398cda05a 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -17,6 +17,7 @@ #include #include "../kselftest_harness.h" +#include "user_events_selftests.h" const char *data_file = "/sys/kernel/tracing/user_events_data"; const char *id_file = "/sys/kernel/tracing/events/user_events/__test_event/id"; @@ -113,6 +114,8 @@ FIXTURE(user) { }; FIXTURE_SETUP(user) { + USER_EVENT_FIXTURE_SETUP(return); + self->data_fd = open(data_file, O_RDWR); ASSERT_NE(-1, self->data_fd); } diff --git a/tools/testing/selftests/user_events/user_events_selftests.h b/tools/testing/selftests/user_events/user_events_selftests.h new file mode 100644 index 0000000000000..690378942f821 --- /dev/null +++ b/tools/testing/selftests/user_events/user_events_selftests.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _USER_EVENTS_SELFTESTS_H +#define _USER_EVENTS_SELFTESTS_H + +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +static inline bool tracefs_enabled(char **message, bool *fail) +{ + struct stat buf; + int ret; + + *message = ""; + *fail = false; + + /* Ensure tracefs is installed */ + ret = stat("/sys/kernel/tracing", &buf); + + if (ret == -1) { + *message = "Tracefs is not installed"; + return false; + } + + /* Ensure mounted tracefs */ + ret = stat("/sys/kernel/tracing/README", &buf); + + if (ret == -1 && errno == ENOENT) { + if (mount(NULL, "/sys/kernel/tracing", "tracefs", 0, NULL) != 0) { + *message = "Cannot mount tracefs"; + *fail = true; + return false; + } + + ret = stat("/sys/kernel/tracing/README", &buf); + } + + if (ret == -1) { + *message = "Cannot access tracefs"; + *fail = true; + return false; + } + + return true; +} + +static inline bool user_events_enabled(char **message, bool *fail) +{ + struct stat buf; + int ret; + + *message = ""; + *fail = false; + + if (getuid() != 0) { + *message = "Must be run as root"; + *fail = true; + return false; + } + + if (!tracefs_enabled(message, fail)) + return false; + + /* Ensure user_events is installed */ + ret = stat("/sys/kernel/tracing/user_events_data", &buf); + + if (ret == -1) { + switch (errno) { + case ENOENT: + *message = "user_events is not installed"; + return false; + + default: + *message = "Cannot access user_events_data"; + *fail = true; + return false; + } + } + + return true; +} + +#define USER_EVENT_FIXTURE_SETUP(statement) do { \ + char *message; \ + bool fail; \ + if (!user_events_enabled(&message, &fail)) { \ + if (fail) { \ + TH_LOG("Setup failed due to: %s", message); \ + ASSERT_FALSE(fail); \ + } \ + SKIP(statement, "Skipping due to: %s", message); \ + } \ +} while (0) + +#endif /* _USER_EVENTS_SELFTESTS_H */ From 7dc1e125f07aeeb8b6eacfd9a05ef3ef6fe539c7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 8 Sep 2023 18:17:21 -0400 Subject: [PATCH 121/161] ftrace/selftests: Add softlink to latest log directory When I'm debugging something with the ftrace selftests and need to look at the logs, it becomes tedious that I need to do the following: ls -ltr logs [ copy the last directory ] ls logs/ to see where the logs are. Instead, do the common practice of having a "latest" softlink to the last run selftest. This way after running the selftest I only need to do: ls logs/latest/ and it will always give me the directory of the last run selftest logs! Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/ftracetest | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index cb5f18c06593d..7df8baa0f98f8 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -124,6 +124,7 @@ parse_opts() { # opts ;; --logdir|-l) LOG_DIR=$2 + LINK_PTR= shift 2 ;; *.tc) @@ -181,7 +182,10 @@ fi TOP_DIR=`absdir $0` TEST_DIR=$TOP_DIR/test.d TEST_CASES=`find_testcases $TEST_DIR` -LOG_DIR=$TOP_DIR/logs/`date +%Y%m%d-%H%M%S`/ +LOG_TOP_DIR=$TOP_DIR/logs +LOG_DATE=`date +%Y%m%d-%H%M%S` +LOG_DIR=$LOG_TOP_DIR/$LOG_DATE/ +LINK_PTR=$LOG_TOP_DIR/latest KEEP_LOG=0 KTAP=0 DEBUG=0 @@ -207,6 +211,10 @@ else LOG_FILE=$LOG_DIR/ftracetest.log mkdir -p $LOG_DIR || errexit "Failed to make a log directory: $LOG_DIR" date > $LOG_FILE + if [ "x-$LINK_PTR" != "x-" ]; then + unlink $LINK_PTR + ln -fs $LOG_DATE $LINK_PTR + fi fi # Define text colors From 7ab6fe6625c9bdcb8fa5f61c8f8e30e13f689284 Mon Sep 17 00:00:00 2001 From: Naresh Kamboju Date: Thu, 7 Sep 2023 13:32:09 +0530 Subject: [PATCH 122/161] selftests: user_events: create test-specific Kconfig fragments Create the config file in user_events directory of testcase which need more kernel configuration than the default defconfig. User could use these configs with merge_config.sh script: The Kconfig CONFIG_USER_EVENTS=y is needed for the test to read data from the following files, - "/sys/kernel/tracing/user_events_data" - "/sys/kernel/tracing/user_events_status" - "/sys/kernel/tracing/events/user_events/*" Enable config for specific testcase: (export ARCH=xxx #for cross compiling) ./scripts/kconfig/merge_config.sh .config \ tools/testing/selftests/user_events/config Enable configs for all testcases: (export ARCH=xxx #for cross compiling) ./scripts/kconfig/merge_config.sh .config \ tools/testing/selftests/*/config Cc: Beau Belgrave Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Signed-off-by: Naresh Kamboju Signed-off-by: Shuah Khan --- tools/testing/selftests/user_events/config | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/testing/selftests/user_events/config diff --git a/tools/testing/selftests/user_events/config b/tools/testing/selftests/user_events/config new file mode 100644 index 0000000000000..64f7a9a90cec4 --- /dev/null +++ b/tools/testing/selftests/user_events/config @@ -0,0 +1 @@ +CONFIG_USER_EVENTS=y From 9243e5430995498f14f92be56da995ded107d71e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 11 Sep 2023 20:06:54 -0400 Subject: [PATCH 123/161] tracefs/eventfs: Use list_for_each_srcu() in dcache_dir_open_wrapper() The eventfs files list is protected by SRCU. In earlier iterations it was protected with just RCU, but because it needed to also call sleepable code, it had to be switch to SRCU. The dcache_dir_open_wrapper() list_for_each_rcu() was missed and did not get converted over to list_for_each_srcu(). That needs to be fixed. Link: https://lore.kernel.org/linux-trace-kernel/20230911120053.ca82f545e7f46ea753deda18@kernel.org/ Link: https://lore.kernel.org/linux-trace-kernel/20230911200654.71ce927c@gandalf.local.home Cc: Mark Rutland Cc: Ajay Kaher Cc: "Paul E. McKenney" Reported-by: Masami Hiramatsu (Google) Fixes: 63940449555e7 ("eventfs: Implement eventfs lookup, read, open functions") Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index f168aca454587..9f64e73327968 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -452,7 +452,8 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) ei = ti->private; idx = srcu_read_lock(&eventfs_srcu); - list_for_each_entry_rcu(ef, &ei->e_top_files, list) { + list_for_each_entry_srcu(ef, &ei->e_top_files, list, + srcu_read_lock_held(&eventfs_srcu)) { create_dentry(ef, dentry, false); } srcu_read_unlock(&eventfs_srcu, idx); From cfaa80c91f6f99b9342b6557f0f0e1143e434066 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Sat, 9 Sep 2023 16:14:34 +0800 Subject: [PATCH 124/161] net/tls: do not free tls_rec on async operation in bpf_exec_tx_verdict() I got the below warning when do fuzzing test: BUG: KASAN: null-ptr-deref in scatterwalk_copychunks+0x320/0x470 Read of size 4 at addr 0000000000000008 by task kworker/u8:1/9 CPU: 0 PID: 9 Comm: kworker/u8:1 Tainted: G OE Hardware name: linux,dummy-virt (DT) Workqueue: pencrypt_parallel padata_parallel_worker Call trace: dump_backtrace+0x0/0x420 show_stack+0x34/0x44 dump_stack+0x1d0/0x248 __kasan_report+0x138/0x140 kasan_report+0x44/0x6c __asan_load4+0x94/0xd0 scatterwalk_copychunks+0x320/0x470 skcipher_next_slow+0x14c/0x290 skcipher_walk_next+0x2fc/0x480 skcipher_walk_first+0x9c/0x110 skcipher_walk_aead_common+0x380/0x440 skcipher_walk_aead_encrypt+0x54/0x70 ccm_encrypt+0x13c/0x4d0 crypto_aead_encrypt+0x7c/0xfc pcrypt_aead_enc+0x28/0x84 padata_parallel_worker+0xd0/0x2dc process_one_work+0x49c/0xbdc worker_thread+0x124/0x880 kthread+0x210/0x260 ret_from_fork+0x10/0x18 This is because the value of rec_seq of tls_crypto_info configured by the user program is too large, for example, 0xffffffffffffff. In addition, TLS is asynchronously accelerated. When tls_do_encryption() returns -EINPROGRESS and sk->sk_err is set to EBADMSG due to rec_seq overflow, skmsg is released before the asynchronous encryption process ends. As a result, the UAF problem occurs during the asynchronous processing of the encryption module. If the operation is asynchronous and the encryption module returns EINPROGRESS, do not free the record information. Fixes: 635d93981786 ("net/tls: free record only on encryption error") Signed-off-by: Liu Jian Reviewed-by: Sabrina Dubroca Link: https://lore.kernel.org/r/20230909081434.2324940-1-liujian56@huawei.com Signed-off-by: Paolo Abeni --- net/tls/tls_sw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 1ed4a611631fe..d1fc295b83b59 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -817,7 +817,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, psock = sk_psock_get(sk); if (!psock || !policy) { err = tls_push_record(sk, flags, record_type); - if (err && sk->sk_err == EBADMSG) { + if (err && err != -EINPROGRESS && sk->sk_err == EBADMSG) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); err = -sk->sk_err; @@ -846,7 +846,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, switch (psock->eval) { case __SK_PASS: err = tls_push_record(sk, flags, record_type); - if (err && sk->sk_err == EBADMSG) { + if (err && err != -EINPROGRESS && sk->sk_err == EBADMSG) { *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); err = -sk->sk_err; From 40d84e198b0ae64df71ac0e70675b16900b90bde Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 6 Sep 2023 12:18:41 +0800 Subject: [PATCH 125/161] PM: hibernate: Rename function parameter from snapshot_test to exclusive Several functions reply on snapshot_test to decide whether to open the resume device exclusively. However there is no strict connection between the snapshot_test and the open mode. Rename the 'snapshot_test' input parameter to 'exclusive' to better reflect the use case. No functional change is expected. Signed-off-by: Chen Yu Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 4 ++-- kernel/power/swap.c | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/power/power.h b/kernel/power/power.h index 46eb14dc50c31..a98f95e309a33 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -168,11 +168,11 @@ extern int swsusp_swap_in_use(void); #define SF_HW_SIG 8 /* kernel/power/hibernate.c */ -int swsusp_check(bool snapshot_test); +int swsusp_check(bool exclusive); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -void swsusp_close(bool snapshot_test); +void swsusp_close(bool exclusive); #ifdef CONFIG_SUSPEND extern int swsusp_unmark(void); #endif diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f6ebcd00c4100..74edbce2320ba 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1513,12 +1513,13 @@ int swsusp_read(unsigned int *flags_p) static void *swsusp_holder; /** - * swsusp_check - Check for swsusp signature in the resume device + * swsusp_check - Check for swsusp signature in the resume device + * @exclusive: Open the resume device exclusively. */ -int swsusp_check(bool snapshot_test) +int swsusp_check(bool exclusive) { - void *holder = snapshot_test ? &swsusp_holder : NULL; + void *holder = exclusive ? &swsusp_holder : NULL; int error; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ, @@ -1563,17 +1564,18 @@ int swsusp_check(bool snapshot_test) } /** - * swsusp_close - close swap device. + * swsusp_close - close swap device. + * @exclusive: Close the resume device which is exclusively opened. */ -void swsusp_close(bool snapshot_test) +void swsusp_close(bool exclusive) { if (IS_ERR(hib_resume_bdev)) { pr_debug("Image device not initialised\n"); return; } - blkdev_put(hib_resume_bdev, snapshot_test ? &swsusp_holder : NULL); + blkdev_put(hib_resume_bdev, exclusive ? &swsusp_holder : NULL); } /** From 148b6f4cc3920e563094540fe1a12d00d3bbccae Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 6 Sep 2023 12:18:52 +0800 Subject: [PATCH 126/161] PM: hibernate: Fix the exclusive get block device in test_resume mode Commit 5904de0d735b ("PM: hibernate: Do not get block device exclusively in test_resume mode") fixes a hibernation issue under test_resume mode. That commit is supposed to open the block device in non-exclusive mode when in test_resume. However the code does the opposite, which is against its description. In summary, the swap device is only opened exclusively by swsusp_check() with its corresponding *close(), and must be in non test_resume mode. This is to avoid the race condition that different processes scribble the device at the same time. All the other cases should use non-exclusive mode. Fix it by really disabling exclusive mode under test_resume. Fixes: 5904de0d735b ("PM: hibernate: Do not get block device exclusively in test_resume mode") Closes: https://lore.kernel.org/lkml/000000000000761f5f0603324129@google.com/ Reported-by: Pengfei Xu Signed-off-by: Chen Yu Tested-by: Chenzhou Feng Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 2b4a946a6ff5c..8d35b9f9aaa3f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -786,9 +786,9 @@ int hibernate(void) unlock_device_hotplug(); if (snapshot_test) { pm_pr_dbg("Checking hibernation image\n"); - error = swsusp_check(snapshot_test); + error = swsusp_check(false); if (!error) - error = load_image_and_restore(snapshot_test); + error = load_image_and_restore(false); } thaw_processes(); @@ -945,14 +945,14 @@ static int software_resume(void) pm_pr_dbg("Looking for hibernation image.\n"); mutex_lock(&system_transition_mutex); - error = swsusp_check(false); + error = swsusp_check(true); if (error) goto Unlock; /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(false); + swsusp_close(true); goto Unlock; } @@ -973,7 +973,7 @@ static int software_resume(void) goto Close_Finish; } - error = load_image_and_restore(false); + error = load_image_and_restore(true); thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); @@ -987,7 +987,7 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(false); + swsusp_close(true); goto Finish; } From 7c95ec3b59479bb24093918bbfc801c9f31826f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 8 Sep 2023 08:25:27 +0300 Subject: [PATCH 127/161] drm/i915: Only check eDP HPD when AUX CH is shared MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apparently Acer Chromebook C740 (BDW-ULT) doesn't have the eDP HPD line properly connected, and thus fails the new HPD check during eDP probe. The result is that we lose the eDP output. I suspect all such machines would be Chromebooks or other Linux exclusive systems as the Windows driver likely wouldn't work either. I did check a few other BDW machines here and those do have eDP HPD connected, one of them even is a different Chromebook (Samus). To account for these funky machines let's skip the HPD check when it looks like the eDP port is the only one using that specific AUX channel. In case of multiple ports sharing the same AUX CH (eg. on Asrock B250M-HDV) we still do the check and thus should correctly ignore the eDP port in favor of the other DP port (usually a DP->VGA converter). v2: Don't oops during list iteration Cc: stable@vger.kernel.org Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/9264 Fixes: cfe5bdfb27fa ("drm/i915: Check HPD live state during eDP probe") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20230908052527.685-1-ville.syrjala@linux.intel.com Reviewed-by: Luca Coelho (cherry picked from commit 70052100fabec5d8c1b09c9959817a2f4517e6b5) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_bios.c | 21 +++++++++++++++++++++ drivers/gpu/drm/i915/display/intel_bios.h | 1 + drivers/gpu/drm/i915/display/intel_dp.c | 7 ++++++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_bios.c b/drivers/gpu/drm/i915/display/intel_bios.c index 858c959f7babf..f735b035436c0 100644 --- a/drivers/gpu/drm/i915/display/intel_bios.c +++ b/drivers/gpu/drm/i915/display/intel_bios.c @@ -3540,6 +3540,27 @@ enum aux_ch intel_bios_dp_aux_ch(const struct intel_bios_encoder_data *devdata) return map_aux_ch(devdata->i915, devdata->child.aux_channel); } +bool intel_bios_dp_has_shared_aux_ch(const struct intel_bios_encoder_data *devdata) +{ + struct drm_i915_private *i915; + u8 aux_channel; + int count = 0; + + if (!devdata || !devdata->child.aux_channel) + return false; + + i915 = devdata->i915; + aux_channel = devdata->child.aux_channel; + + list_for_each_entry(devdata, &i915->display.vbt.display_devices, node) { + if (intel_bios_encoder_supports_dp(devdata) && + aux_channel == devdata->child.aux_channel) + count++; + } + + return count > 1; +} + int intel_bios_dp_boost_level(const struct intel_bios_encoder_data *devdata) { if (!devdata || devdata->i915->display.vbt.version < 196 || !devdata->child.iboost) diff --git a/drivers/gpu/drm/i915/display/intel_bios.h b/drivers/gpu/drm/i915/display/intel_bios.h index 9680e3e92bb51..49e24b7cf6753 100644 --- a/drivers/gpu/drm/i915/display/intel_bios.h +++ b/drivers/gpu/drm/i915/display/intel_bios.h @@ -273,6 +273,7 @@ enum aux_ch intel_bios_dp_aux_ch(const struct intel_bios_encoder_data *devdata); int intel_bios_dp_boost_level(const struct intel_bios_encoder_data *devdata); int intel_bios_dp_max_lane_count(const struct intel_bios_encoder_data *devdata); int intel_bios_dp_max_link_rate(const struct intel_bios_encoder_data *devdata); +bool intel_bios_dp_has_shared_aux_ch(const struct intel_bios_encoder_data *devdata); int intel_bios_hdmi_boost_level(const struct intel_bios_encoder_data *devdata); int intel_bios_hdmi_ddc_pin(const struct intel_bios_encoder_data *devdata); int intel_bios_hdmi_level_shift(const struct intel_bios_encoder_data *devdata); diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 12bd2f322e627..e0e4cb5292846 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -5512,8 +5512,13 @@ static bool intel_edp_init_connector(struct intel_dp *intel_dp, /* * VBT and straps are liars. Also check HPD as that seems * to be the most reliable piece of information available. + * + * ... expect on devices that forgot to hook HPD up for eDP + * (eg. Acer Chromebook C710), so we'll check it only if multiple + * ports are attempting to use the same AUX CH, according to VBT. */ - if (!intel_digital_port_connected(encoder)) { + if (intel_bios_dp_has_shared_aux_ch(encoder->devdata) && + !intel_digital_port_connected(encoder)) { /* * If this fails, presume the DPCD answer came * from some other port using the same AUX CH. From 403f0e771457e2b8811dc280719d11b9bacf10f4 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Fri, 8 Sep 2023 13:29:13 +0200 Subject: [PATCH 128/161] net: macb: fix sleep inside spinlock macb_set_tx_clk() is called under a spinlock but itself calls clk_set_rate() which can sleep. This results in: | BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580 | pps pps1: new PPS source ptp1 | in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 40, name: kworker/u4:3 | preempt_count: 1, expected: 0 | RCU nest depth: 0, expected: 0 | 4 locks held by kworker/u4:3/40: | #0: ffff000003409148 | macb ff0c0000.ethernet: gem-ptp-timer ptp clock registered. | ((wq_completion)events_power_efficient){+.+.}-{0:0}, at: process_one_work+0x14c/0x51c | #1: ffff8000833cbdd8 ((work_completion)(&pl->resolve)){+.+.}-{0:0}, at: process_one_work+0x14c/0x51c | #2: ffff000004f01578 (&pl->state_mutex){+.+.}-{4:4}, at: phylink_resolve+0x44/0x4e8 | #3: ffff000004f06f50 (&bp->lock){....}-{3:3}, at: macb_mac_link_up+0x40/0x2ac | irq event stamp: 113998 | hardirqs last enabled at (113997): [] _raw_spin_unlock_irq+0x30/0x64 | hardirqs last disabled at (113998): [] _raw_spin_lock_irqsave+0xac/0xc8 | softirqs last enabled at (113608): [] __do_softirq+0x430/0x4e4 | softirqs last disabled at (113597): [] ____do_softirq+0x10/0x1c | CPU: 0 PID: 40 Comm: kworker/u4:3 Not tainted 6.5.0-11717-g9355ce8b2f50-dirty #368 | Hardware name: ... ZynqMP ... (DT) | Workqueue: events_power_efficient phylink_resolve | Call trace: | dump_backtrace+0x98/0xf0 | show_stack+0x18/0x24 | dump_stack_lvl+0x60/0xac | dump_stack+0x18/0x24 | __might_resched+0x144/0x24c | __might_sleep+0x48/0x98 | __mutex_lock+0x58/0x7b0 | mutex_lock_nested+0x24/0x30 | clk_prepare_lock+0x4c/0xa8 | clk_set_rate+0x24/0x8c | macb_mac_link_up+0x25c/0x2ac | phylink_resolve+0x178/0x4e8 | process_one_work+0x1ec/0x51c | worker_thread+0x1ec/0x3e4 | kthread+0x120/0x124 | ret_from_fork+0x10/0x20 The obvious fix is to move the call to macb_set_tx_clk() out of the protected area. This seems safe as rx and tx are both disabled anyway at this point. It is however not entirely clear what the spinlock shall protect. It could be the read-modify-write access to the NCFGR register, but this is accessed in macb_set_rx_mode() and macb_set_rxcsum_feature() as well without holding the spinlock. It could also be the register accesses done in mog_init_rings() or macb_init_buffers(), but again these functions are called without holding the spinlock in macb_hresp_error_task(). The locking seems fishy in this driver and it might deserve another look before this patch is applied. Fixes: 633e98a711ac0 ("net: macb: use resolved link config in mac_link_up()") Signed-off-by: Sascha Hauer Link: https://lore.kernel.org/r/20230908112913.1701766-1-s.hauer@pengutronix.de Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cadence/macb_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 31f664ee4d778..b940dcd3ace68 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -756,8 +756,6 @@ static void macb_mac_link_up(struct phylink_config *config, if (rx_pause) ctrl |= MACB_BIT(PAE); - macb_set_tx_clk(bp, speed); - /* Initialize rings & buffers as clearing MACB_BIT(TE) in link down * cleared the pipeline and control registers. */ @@ -777,6 +775,9 @@ static void macb_mac_link_up(struct phylink_config *config, spin_unlock_irqrestore(&bp->lock, flags); + if (!(bp->caps & MACB_CAPS_MACB_IS_EMAC)) + macb_set_tx_clk(bp, speed); + /* Enable Rx and Tx; Enable PTP unicast */ ctrl = macb_readl(bp, NCR); if (gem_has_ptp(bp)) From 88956eabfdea7d01d550535af120d4ef265b1d02 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 12 Sep 2023 11:25:00 +1000 Subject: [PATCH 129/161] NFSD: fix possible oops when nfsd/pool_stats is closed. If /proc/fs/nfsd/pool_stats is open when the last nfsd thread exits, then when the file is closed a NULL pointer is dereferenced. This is because nfsd_pool_stats_release() assumes that the pointer to the svc_serv cannot become NULL while a reference is held. This used to be the case but a recent patch split nfsd_last_thread() out from nfsd_put(), and clearing the pointer is done in nfsd_last_thread(). This is easily reproduced by running rpc.nfsd 8 ; ( rpc.nfsd 0;true) < /proc/fs/nfsd/pool_stats Fortunately nfsd_pool_stats_release() has easy access to the svc_serv pointer, and so can call svc_put() on it directly. Fixes: 9f28a971ee9f ("nfsd: separate nfsd_last_thread() from nfsd_put()") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 1582af33e204a..c7af1095f6b54 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -1082,11 +1082,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file) int nfsd_pool_stats_release(struct inode *inode, struct file *file) { + struct seq_file *seq = file->private_data; + struct svc_serv *serv = seq->private; int ret = seq_release(inode, file); - struct net *net = inode->i_sb->s_fs_info; mutex_lock(&nfsd_mutex); - nfsd_put(net); + svc_put(serv); mutex_unlock(&nfsd_mutex); return ret; } From c8414dab164a74bd3bb859a2d836cb537d6b9298 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Tue, 12 Sep 2023 21:47:52 +0800 Subject: [PATCH 130/161] eventfs: Fix the NULL pointer dereference bug in eventfs_remove_rec() Inject fault while probing btrfs.ko, if kstrdup() fails in eventfs_prepare_ef() in eventfs_add_dir(), it will return ERR_PTR to assign file->ef. But the eventfs_remove() check NULL in trace_module_remove_events(), which causes the below NULL pointer dereference. As both Masami and Steven suggest, allocater side should handle the error carefully and remove it, so fix the places where it failed. Could not create tracefs 'raid56_write' directory Btrfs loaded, zoned=no, fsverity=no Unable to handle kernel NULL pointer dereference at virtual address 000000000000001c Mem abort info: ESR = 0x0000000096000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault Data abort info: ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=0000000102544000 [000000000000001c] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: btrfs(-) libcrc32c xor xor_neon raid6_pq cfg80211 rfkill 8021q garp mrp stp llc ipv6 [last unloaded: btrfs] CPU: 15 PID: 1343 Comm: rmmod Tainted: G N 6.5.0+ #40 Hardware name: linux,dummy-virt (DT) pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : eventfs_remove_rec+0x24/0xc0 lr : eventfs_remove+0x68/0x1d8 sp : ffff800082d63b60 x29: ffff800082d63b60 x28: ffffb84b80ddd00c x27: ffffb84b3054ba40 x26: 0000000000000002 x25: ffff800082d63bf8 x24: ffffb84b8398e440 x23: ffffb84b82af3000 x22: dead000000000100 x21: dead000000000122 x20: ffff800082d63bf8 x19: fffffffffffffff4 x18: ffffb84b82508820 x17: 0000000000000000 x16: 0000000000000000 x15: 000083bc876a3166 x14: 000000000000006d x13: 000000000000006d x12: 0000000000000000 x11: 0000000000000001 x10: 00000000000017e0 x9 : 0000000000000001 x8 : 0000000000000000 x7 : 0000000000000000 x6 : ffffb84b84289804 x5 : 0000000000000000 x4 : 9696969696969697 x3 : ffff33a5b7601f38 x2 : 0000000000000000 x1 : ffff800082d63bf8 x0 : fffffffffffffff4 Call trace: eventfs_remove_rec+0x24/0xc0 eventfs_remove+0x68/0x1d8 remove_event_file_dir+0x88/0x100 event_remove+0x140/0x15c trace_module_notify+0x1fc/0x230 notifier_call_chain+0x98/0x17c blocking_notifier_call_chain+0x4c/0x74 __arm64_sys_delete_module+0x1a4/0x298 invoke_syscall+0x44/0x100 el0_svc_common.constprop.1+0x68/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x3c/0xc4 el0t_64_sync_handler+0xa0/0xc4 el0t_64_sync+0x174/0x178 Code: 5400052c a90153b3 aa0003f3 aa0103f4 (f9401400) ---[ end trace 0000000000000000 ]--- Kernel panic - not syncing: Oops: Fatal exception SMP: stopping secondary CPUs Dumping ftrace buffer: (ftrace buffer empty) Kernel Offset: 0x384b00c00000 from 0xffff800080000000 PHYS_OFFSET: 0xffffcc5b80000000 CPU features: 0x88000203,3c020000,1000421b Memory Limit: none Rebooting in 1 seconds.. Link: https://lore.kernel.org/linux-trace-kernel/20230912134752.1838524-1-ruanjinjie@huawei.com Link: https://lore.kernel.org/all/20230912025808.668187-1-ruanjinjie@huawei.com/ Link: https://lore.kernel.org/all/20230911052818.1020547-1-ruanjinjie@huawei.com/ Link: https://lore.kernel.org/all/20230909072817.182846-1-ruanjinjie@huawei.com/ Link: https://lore.kernel.org/all/20230908074816.3724716-1-ruanjinjie@huawei.com/ Cc: Ajay Kaher Fixes: 5bdcd5f5331a ("eventfs: Implement removal of meta data from eventfs") Signed-off-by: Jinjie Ruan Suggested-by: Masami Hiramatsu (Google) Suggested-by: Steven Rostedt Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 065c639918582..91951d038ba49 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2286,6 +2286,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, { struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; + struct eventfs_file *ef; int res; /* First see if we did not already create this dir */ @@ -2318,13 +2319,14 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - dir->ef = eventfs_add_subsystem_dir(name, parent); - if (IS_ERR(dir->ef)) { + ef = eventfs_add_subsystem_dir(name, parent); + if (IS_ERR(ef)) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; } + dir->ef = ef; dir->tr = tr; dir->ref_count = 1; dir->nr_events = 1; @@ -2404,6 +2406,7 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) struct trace_event_call *call = file->event_call; struct eventfs_file *ef_subsystem = NULL; struct trace_array *tr = file->tr; + struct eventfs_file *ef; const char *name; int ret; @@ -2420,12 +2423,14 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) return -ENOMEM; name = trace_event_name(call); - file->ef = eventfs_add_dir(name, ef_subsystem); - if (IS_ERR(file->ef)) { + ef = eventfs_add_dir(name, ef_subsystem); + if (IS_ERR(ef)) { pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } + file->ef = ef; + if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file, &ftrace_enable_fops); From 7a6102aa6df0d5d032b4cbc51935d1d4cda17254 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 11 Sep 2023 15:58:25 +0200 Subject: [PATCH 131/161] veth: Update XDP feature set when bringing up device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's an early return in veth_set_features() if the device is in a down state, which leads to the XDP feature flags not being updated when enabling GRO while the device is down. Which in turn leads to XDP_REDIRECT not working, because the redirect code now checks the flags. Fix this by updating the feature flags after bringing the device up. Before this patch: NETDEV_XDP_ACT_BASIC: yes NETDEV_XDP_ACT_REDIRECT: yes NETDEV_XDP_ACT_NDO_XMIT: no NETDEV_XDP_ACT_XSK_ZEROCOPY: no NETDEV_XDP_ACT_HW_OFFLOAD: no NETDEV_XDP_ACT_RX_SG: yes NETDEV_XDP_ACT_NDO_XMIT_SG: no After this patch: NETDEV_XDP_ACT_BASIC: yes NETDEV_XDP_ACT_REDIRECT: yes NETDEV_XDP_ACT_NDO_XMIT: yes NETDEV_XDP_ACT_XSK_ZEROCOPY: no NETDEV_XDP_ACT_HW_OFFLOAD: no NETDEV_XDP_ACT_RX_SG: yes NETDEV_XDP_ACT_NDO_XMIT_SG: yes Fixes: fccca038f300 ("veth: take into account device reconfiguration for xdp_features flag") Fixes: 66c0e13ad236 ("drivers: net: turn on XDP features") Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20230911135826.722295-1-toke@redhat.com Signed-off-by: Paolo Abeni --- drivers/net/veth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 9c6f4f83f22b0..0deefd1573cf2 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1446,6 +1446,8 @@ static int veth_open(struct net_device *dev) netif_carrier_on(peer); } + veth_set_xdp_features(dev); + return 0; } From 7e021da80f48582171029714f8a487347f29dddb Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 12 Sep 2023 10:10:39 +0900 Subject: [PATCH 132/161] selftests: tracing: Fix to unmount tracefs for recovering environment Fix to unmount the tracefs if the ftracetest mounted it for recovering system environment. If the tracefs is already mounted, this does nothing. Suggested-by: Mark Brown Link: https://lore.kernel.org/all/29fce076-746c-4650-8358-b4e0fa215cf7@sirena.org.uk/ Fixes: cbd965bde74c ("ftrace/selftests: Return the skip code when tracing directory not configured in kernel") Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/ftracetest | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 7df8baa0f98f8..c778d4dcc17e5 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -31,6 +31,9 @@ err_ret=1 # kselftest skip code is 4 err_skip=4 +# umount required +UMOUNT_DIR="" + # cgroup RT scheduling prevents chrt commands from succeeding, which # induces failures in test wakeup tests. Disable for the duration of # the tests. @@ -45,6 +48,9 @@ setup() { cleanup() { echo $sched_rt_runtime_orig > $sched_rt_runtime + if [ -n "${UMOUNT_DIR}" ]; then + umount ${UMOUNT_DIR} ||: + fi } errexit() { # message @@ -161,11 +167,13 @@ if [ -z "$TRACING_DIR" ]; then mount -t tracefs nodev /sys/kernel/tracing || errexit "Failed to mount /sys/kernel/tracing" TRACING_DIR="/sys/kernel/tracing" + UMOUNT_DIR=${TRACING_DIR} # If debugfs exists, then so does /sys/kernel/debug elif [ -d "/sys/kernel/debug" ]; then mount -t debugfs nodev /sys/kernel/debug || errexit "Failed to mount /sys/kernel/debug" TRACING_DIR="/sys/kernel/debug/tracing" + UMOUNT_DIR=${TRACING_DIR} else err_ret=$err_skip errexit "debugfs and tracefs are not configured in this kernel" From 08700ec705043eb0cee01b35cf5b9d63f0230d12 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 6 Sep 2023 03:46:57 +0900 Subject: [PATCH 133/161] linux/export: fix reference to exported functions for parisc64 John David Anglin reported parisc has been broken since commit ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost"). Like ia64, parisc64 uses a function descriptor. The function references must be prefixed with P%. Also, symbols prefixed $$ from the library have the symbol type STT_LOPROC instead of STT_FUNC. They should be handled as functions too. Fixes: ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost") Reported-by: John David Anglin Tested-by: John David Anglin Tested-by: Helge Deller Closes: https://lore.kernel.org/linux-parisc/1901598a-e11d-f7dd-a5d9-9a69d06e6b6e@bell.net/T/#u Signed-off-by: Masahiro Yamada Signed-off-by: Helge Deller --- include/linux/export-internal.h | 2 ++ scripts/mod/modpost.c | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h index 1c849db953a51..45fca09b23194 100644 --- a/include/linux/export-internal.h +++ b/include/linux/export-internal.h @@ -52,6 +52,8 @@ #ifdef CONFIG_IA64 #define KSYM_FUNC(name) @fptr(name) +#elif defined(CONFIG_PARISC) && defined(CONFIG_64BIT) +#define KSYM_FUNC(name) P%name #else #define KSYM_FUNC(name) name #endif diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index b29b29707f104..ba981f22908ad 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1226,6 +1226,15 @@ static void check_export_symbol(struct module *mod, struct elf_info *elf, */ s->is_func = (ELF_ST_TYPE(sym->st_info) == STT_FUNC); + /* + * For parisc64, symbols prefixed $$ from the library have the symbol type + * STT_LOPROC. They should be handled as functions too. + */ + if (elf->hdr->e_ident[EI_CLASS] == ELFCLASS64 && + elf->hdr->e_machine == EM_PARISC && + ELF_ST_TYPE(sym->st_info) == STT_LOPROC) + s->is_func = true; + if (match(secname, PATTERNS(INIT_SECTIONS))) warn("%s: %s: EXPORT_SYMBOL used for init symbol. Remove __init or EXPORT_SYMBOL.\n", mod->name, name); From dad651b2a44eb6b201738f810254279dca29d30d Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 12 Sep 2023 17:52:49 +0200 Subject: [PATCH 134/161] nvme-pci: do not set the NUMA node of device if it has none If a device has no NUMA node information associated with it, the driver puts the device in node first_memory_node (say node 0). Not having a NUMA node and being associated with node 0 are completely different things and it makes little sense to mix the two. Signed-off-by: Pratyush Yadav Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index baf69af7ea78e..f5ba2d7102eae 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2916,9 +2916,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, struct nvme_dev *dev; int ret = -ENOMEM; - if (node == NUMA_NO_NODE) - set_dev_node(&pdev->dev, first_memory_node); - dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) return ERR_PTR(-ENOMEM); From 8cdd9f1aaedf823006449faa4e540026c692ac43 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Sep 2023 15:42:13 +0000 Subject: [PATCH 135/161] ipv6: fix ip6_sock_set_addr_preferences() typo ip6_sock_set_addr_preferences() second argument should be an integer. SUNRPC attempts to set IPV6_PREFER_SRC_PUBLIC were translated to IPV6_PREFER_SRC_TMP Fixes: 18d5ad623275 ("ipv6: add ip6_sock_set_addr_preferences") Signed-off-by: Eric Dumazet Cc: Christoph Hellwig Cc: Chuck Lever Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230911154213.713941-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 0675be0f3fa0e..fe274c122a563 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1360,7 +1360,7 @@ static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) return 0; } -static inline int ip6_sock_set_addr_preferences(struct sock *sk, bool val) +static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val) { int ret; From ea72883a3bf11fb09dd1ad4f8328cc040263881a Mon Sep 17 00:00:00 2001 From: "Justin M. Forbes" Date: Tue, 12 Sep 2023 12:02:47 -0500 Subject: [PATCH 136/161] tpm: Fix typo in tpmrm class definition Commit d2e8071bed0be ("tpm: make all 'class' structures const") unfortunately had a typo for the name on tpmrm. Fixes: d2e8071bed0b ("tpm: make all 'class' structures const") Signed-off-by: Justin M. Forbes Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index 23f6f2eda84c0..42b1062e33cd5 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -33,7 +33,7 @@ const struct class tpm_class = { .shutdown_pre = tpm_class_shutdown, }; const struct class tpmrm_class = { - .name = "tmprm", + .name = "tpmrm", }; dev_t tpm_devt; From 1bfb2b618d52e59a4ef1896b46c4698ad2be66b7 Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Wed, 6 Sep 2023 17:58:17 +0800 Subject: [PATCH 137/161] riscv: kexec: Align the kexeced kernel entry The current riscv boot protocol requires 2MB alignment for RV64 and 4MB alignment for RV32. In KEXEC_FILE path, the elf_find_pbase() function should align the kexeced kernel entry according to the requirement, otherwise the kexeced kernel would silently BUG at the setup_vm(). Fixes: 8acea455fafa ("RISC-V: Support for kexec_file on panic") Signed-off-by: Song Shuai Link: https://lore.kernel.org/r/20230906095817.364390-1-songshuaishuai@tinylab.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/elf_kexec.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c index f4099059ed8f2..e60fbd8660c4a 100644 --- a/arch/riscv/kernel/elf_kexec.c +++ b/arch/riscv/kernel/elf_kexec.c @@ -98,7 +98,13 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, kbuf.image = image; kbuf.buf_min = lowest_paddr; kbuf.buf_max = ULONG_MAX; - kbuf.buf_align = PAGE_SIZE; + + /* + * Current riscv boot protocol requires 2MB alignment for + * RV64 and 4MB alignment for RV32 + * + */ + kbuf.buf_align = PMD_SIZE; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); kbuf.top_down = false; From 8eb8fe67e2c84324398f5983c41b4f831d0705b3 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Tue, 12 Sep 2023 15:24:10 +0800 Subject: [PATCH 138/161] riscv: errata: fix T-Head dcache.cva encoding The dcache.cva encoding shown in the comments are wrong, it's for dcache.cval1 (which is restricted to L1) instead. Fix this in the comment and in the hardcoded instruction. Signed-off-by: Icenowy Zheng Tested-by: Sergey Matyukevich Reviewed-by: Heiko Stuebner Reviewed-by: Guo Ren Tested-by: Drew Fustini Link: https://lore.kernel.org/r/20230912072410.2481-1-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/errata_list.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index e2ecd01bfac77..b55b434f00591 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -105,7 +105,7 @@ asm volatile(ALTERNATIVE( \ * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 | * 0000001 01001 rs1 000 00000 0001011 * dcache.cva rs1 (clean, virtual address) - * 0000001 00100 rs1 000 00000 0001011 + * 0000001 00101 rs1 000 00000 0001011 * * dcache.cipa rs1 (clean then invalidate, physical address) * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 | @@ -118,7 +118,7 @@ asm volatile(ALTERNATIVE( \ * 0000000 11001 00000 000 00000 0001011 */ #define THEAD_inval_A0 ".long 0x0265000b" -#define THEAD_clean_A0 ".long 0x0245000b" +#define THEAD_clean_A0 ".long 0x0255000b" #define THEAD_flush_A0 ".long 0x0275000b" #define THEAD_SYNC_S ".long 0x0190000b" From ccf1dab96be4caed7c5235b1cfdb606ac161b996 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Mon, 11 Sep 2023 16:23:58 +0200 Subject: [PATCH 139/161] selinux: fix handling of empty opts in selinux_fs_context_submount() selinux_set_mnt_opts() relies on the fact that the mount options pointer is always NULL when all options are unset (specifically in its !selinux_initialized() branch. However, the new selinux_fs_context_submount() hook breaks this rule by allocating a new structure even if no options are set. That causes any submount created before a SELinux policy is loaded to be rejected in selinux_set_mnt_opts(). Fix this by making selinux_fs_context_submount() leave fc->security set to NULL when there are no options to be copied from the reference superblock. Cc: Reported-by: Adam Williamson Link: https://bugzilla.redhat.com/show_bug.cgi?id=2236345 Fixes: d80a8f1b58c2 ("vfs, security: Fix automount superblock LSM init problem, preventing NFS sb sharing") Signed-off-by: Ondrej Mosnacek Reviewed-by: Jeff Layton Signed-off-by: Paul Moore --- security/selinux/hooks.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 10350534de6d6..2aa0e219d7217 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2775,14 +2775,20 @@ static int selinux_umount(struct vfsmount *mnt, int flags) static int selinux_fs_context_submount(struct fs_context *fc, struct super_block *reference) { - const struct superblock_security_struct *sbsec; + const struct superblock_security_struct *sbsec = selinux_superblock(reference); struct selinux_mnt_opts *opts; + /* + * Ensure that fc->security remains NULL when no options are set + * as expected by selinux_set_mnt_opts(). + */ + if (!(sbsec->flags & (FSCONTEXT_MNT|CONTEXT_MNT|DEFCONTEXT_MNT))) + return 0; + opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) return -ENOMEM; - sbsec = selinux_superblock(reference); if (sbsec->flags & FSCONTEXT_MNT) opts->fscontext_sid = sbsec->sid; if (sbsec->flags & CONTEXT_MNT) From edcfe22985d09ee8e2346c9217f5a52ab150099f Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Mon, 11 Sep 2023 14:49:06 -0400 Subject: [PATCH 140/161] drm/amdkfd: Insert missing TLB flush on GFX10 and later Heavy-weight TLB flush is required after unmap on all GPUs for correctness and security. Signed-off-by: Harish Kasiviswanathan Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 3d9ce44d88da5..fa24e1852493d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1466,8 +1466,7 @@ void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type); static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev) { - return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || - KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) || + return KFD_GC_VERSION(dev) > IP_VERSION(9, 4, 2) || (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1) && dev->sdma_fw_version >= 18) || KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0); } From c6d277064b1da7f9015b575a562734de87a7e463 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 11 Sep 2023 11:36:55 -0700 Subject: [PATCH 141/161] tcp: Factorise sk_family-independent comparison in inet_bind2_bucket_match(_addr_any). This is a prep patch to make the following patches cleaner that touch inet_bind2_bucket_match() and inet_bind2_bucket_match_addr_any(). Both functions have duplicated comparison for netns, port, and l3mdev. Let's factorise them. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 7876b7d703cb5..5c54f2804174d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -815,41 +815,39 @@ static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { + if (!net_eq(ib2_net(tb), net) || tb->port != port || + tb->l3mdev != l3mdev) + return false; + #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family != tb->family) return false; if (sk->sk_family == AF_INET6) - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && - ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); - else + return ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); #endif - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr; + return tb->rcv_saddr == sk->sk_rcv_saddr; } bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { + if (!net_eq(ib2_net(tb), net) || tb->port != port || + tb->l3mdev != l3mdev) + return false; + #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family != tb->family) { if (sk->sk_family == AF_INET) - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && - ipv6_addr_any(&tb->v6_rcv_saddr); + return ipv6_addr_any(&tb->v6_rcv_saddr); return false; } if (sk->sk_family == AF_INET6) - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && - ipv6_addr_any(&tb->v6_rcv_saddr); - else + return ipv6_addr_any(&tb->v6_rcv_saddr); #endif - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && tb->rcv_saddr == 0; + return tb->rcv_saddr == 0; } /* The socket's bhash2 hashbucket spinlock must be held when this is called */ From aa99e5f87bd54db55dd37cb130bd5eb55933027f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 11 Sep 2023 11:36:56 -0700 Subject: [PATCH 142/161] tcp: Fix bind() regression for v4-mapped-v6 wildcard address. Andrei Vagin reported bind() regression with strace logs. If we bind() a TCPv6 socket to ::FFFF:0.0.0.0 and then bind() a TCPv4 socket to 127.0.0.1, the 2nd bind() should fail but now succeeds. from socket import * s1 = socket(AF_INET6, SOCK_STREAM) s1.bind(('::ffff:0.0.0.0', 0)) s2 = socket(AF_INET, SOCK_STREAM) s2.bind(('127.0.0.1', s1.getsockname()[1])) During the 2nd bind(), if tb->family is AF_INET6 and sk->sk_family is AF_INET in inet_bind2_bucket_match_addr_any(), we still need to check if tb has the v4-mapped-v6 wildcard address. The example above does not work after commit 5456262d2baa ("net: Fix incorrect address comparison when searching for a bind2 bucket"), but the blamed change is not the commit. Before the commit, the leading zeros of ::FFFF:0.0.0.0 were treated as 0.0.0.0, and the sequence above worked by chance. Technically, this case has been broken since bhash2 was introduced. Note that if we bind() two sockets to 127.0.0.1 and then ::FFFF:0.0.0.0, the 2nd bind() fails properly because we fall back to using bhash to detect conflicts for the v4-mapped-v6 address. Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Reported-by: Andrei Vagin Closes: https://lore.kernel.org/netdev/ZPuYBOFC8zsK6r9T@google.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 5 +++++ net/ipv4/inet_hashtables.c | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fe274c122a563..c6932d1a3fa80 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -784,6 +784,11 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) cpu_to_be32(0x0000ffff))) == 0UL; } +static inline bool ipv6_addr_v4mapped_any(const struct in6_addr *a) +{ + return ipv6_addr_v4mapped(a) && ipv4_is_zeronet(a->s6_addr32[3]); +} + static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 5c54f2804174d..a58b04052ca6b 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -839,7 +839,8 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family != tb->family) { if (sk->sk_family == AF_INET) - return ipv6_addr_any(&tb->v6_rcv_saddr); + return ipv6_addr_any(&tb->v6_rcv_saddr) || + ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr); return false; } From c48ef9c4aed3632566b57ba66cec6ec78624d4cb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 11 Sep 2023 11:36:57 -0700 Subject: [PATCH 143/161] tcp: Fix bind() regression for v4-mapped-v6 non-wildcard address. Since bhash2 was introduced, the example below does not work as expected. These two bind() should conflict, but the 2nd bind() now succeeds. from socket import * s1 = socket(AF_INET6, SOCK_STREAM) s1.bind(('::ffff:127.0.0.1', 0)) s2 = socket(AF_INET, SOCK_STREAM) s2.bind(('127.0.0.1', s1.getsockname()[1])) During the 2nd bind() in inet_csk_get_port(), inet_bind2_bucket_find() fails to find the 1st socket's tb2, so inet_bind2_bucket_create() allocates a new tb2 for the 2nd socket. Then, we call inet_csk_bind_conflict() that checks conflicts in the new tb2 by inet_bhash2_conflict(). However, the new tb2 does not include the 1st socket, thus the bind() finally succeeds. In this case, inet_bind2_bucket_match() must check if AF_INET6 tb2 has the conflicting v4-mapped-v6 address so that inet_bind2_bucket_find() returns the 1st socket's tb2. Note that if we bind two sockets to 127.0.0.1 and then ::FFFF:127.0.0.1, the 2nd bind() fails properly for the same reason mentinoed in the previous commit. Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index a58b04052ca6b..c32f5e28758be 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -820,8 +820,13 @@ static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, return false; #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family != tb->family) + if (sk->sk_family != tb->family) { + if (sk->sk_family == AF_INET) + return ipv6_addr_v4mapped(&tb->v6_rcv_saddr) && + tb->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr; + return false; + } if (sk->sk_family == AF_INET6) return ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); From 0071d15517b4a3d265abc00395beb1138e7236c7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 11 Sep 2023 11:36:58 -0700 Subject: [PATCH 144/161] selftest: tcp: Fix address length in bind_wildcard.c. The selftest passes the IPv6 address length for an IPv4 address. We should pass the correct length. Note inet_bind_sk() does not check if the size is larger than sizeof(struct sockaddr_in), so there is no real bug in this selftest. Fixes: 13715acf8ab5 ("selftest: Add test for bind() conflicts.") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- tools/testing/selftests/net/bind_wildcard.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 58edfc15d28bd..e7ebe72e879d7 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -100,7 +100,7 @@ void bind_sockets(struct __test_metadata *_metadata, TEST_F(bind_wildcard, v4_v6) { bind_sockets(_metadata, self, - (struct sockaddr *)&self->addr4, sizeof(self->addr6), + (struct sockaddr *)&self->addr4, sizeof(self->addr4), (struct sockaddr *)&self->addr6, sizeof(self->addr6)); } From 2895d879dd41a588d80acde1aa832deb38d67823 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 11 Sep 2023 11:36:59 -0700 Subject: [PATCH 145/161] selftest: tcp: Move expected_errno into each test case in bind_wildcard.c. This is a preparation patch for the following patch. Let's define expected_errno in each test case so that we can add other test cases easily. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- tools/testing/selftests/net/bind_wildcard.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index e7ebe72e879d7..81f6945360998 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -10,37 +10,41 @@ FIXTURE(bind_wildcard) { struct sockaddr_in addr4; struct sockaddr_in6 addr6; - int expected_errno; }; FIXTURE_VARIANT(bind_wildcard) { const __u32 addr4_const; const struct in6_addr *addr6_const; + int expected_errno; }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { .addr4_const = INADDR_ANY, .addr6_const = &in6addr_any, + .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) { .addr4_const = INADDR_ANY, .addr6_const = &in6addr_loopback, + .expected_errno = 0, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) { .addr4_const = INADDR_LOOPBACK, .addr6_const = &in6addr_any, + .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) { .addr4_const = INADDR_LOOPBACK, .addr6_const = &in6addr_loopback, + .expected_errno = 0, }; FIXTURE_SETUP(bind_wildcard) @@ -52,11 +56,6 @@ FIXTURE_SETUP(bind_wildcard) self->addr6.sin6_family = AF_INET6; self->addr6.sin6_port = htons(0); self->addr6.sin6_addr = *variant->addr6_const; - - if (variant->addr6_const == &in6addr_any) - self->expected_errno = EADDRINUSE; - else - self->expected_errno = 0; } FIXTURE_TEARDOWN(bind_wildcard) @@ -65,6 +64,7 @@ FIXTURE_TEARDOWN(bind_wildcard) void bind_sockets(struct __test_metadata *_metadata, FIXTURE_DATA(bind_wildcard) *self, + int expected_errno, struct sockaddr *addr1, socklen_t addrlen1, struct sockaddr *addr2, socklen_t addrlen2) { @@ -86,9 +86,9 @@ void bind_sockets(struct __test_metadata *_metadata, ASSERT_GT(fd[1], 0); ret = bind(fd[1], addr2, addrlen2); - if (self->expected_errno) { + if (expected_errno) { ASSERT_EQ(ret, -1); - ASSERT_EQ(errno, self->expected_errno); + ASSERT_EQ(errno, expected_errno); } else { ASSERT_EQ(ret, 0); } @@ -99,14 +99,14 @@ void bind_sockets(struct __test_metadata *_metadata, TEST_F(bind_wildcard, v4_v6) { - bind_sockets(_metadata, self, + bind_sockets(_metadata, self, variant->expected_errno, (struct sockaddr *)&self->addr4, sizeof(self->addr4), (struct sockaddr *)&self->addr6, sizeof(self->addr6)); } TEST_F(bind_wildcard, v6_v4) { - bind_sockets(_metadata, self, + bind_sockets(_metadata, self, variant->expected_errno, (struct sockaddr *)&self->addr6, sizeof(self->addr6), (struct sockaddr *)&self->addr4, sizeof(self->addr4)); } From 8637d8e8b653f4c8b6fd277b434b118f844d1d77 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 11 Sep 2023 11:37:00 -0700 Subject: [PATCH 146/161] selftest: tcp: Add v4-mapped-v6 cases in bind_wildcard.c. We add these 8 test cases in bind_wildcard.c to check bind() conflicts. 1st bind() 2nd bind() --------- --------- 0.0.0.0 ::FFFF:0.0.0.0 ::FFFF:0.0.0.0 0.0.0.0 0.0.0.0 ::FFFF:127.0.0.1 ::FFFF:127.0.0.1 0.0.0.0 127.0.0.1 ::FFFF:0.0.0.0 ::FFFF:0.0.0.0 127.0.0.1 127.0.0.1 ::FFFF:127.0.0.1 ::FFFF:127.0.0.1 127.0.0.1 All test passed without bhash2 and with bhash2 and this series. Before bhash2: $ uname -r 6.0.0-rc1-00393-g0bf73255d3a3 $ ./bind_wildcard ... # PASSED: 16 / 16 tests passed. Just after bhash2: $ uname -r 6.0.0-rc1-00394-g28044fc1d495 $ ./bind_wildcard ... ok 15 bind_wildcard.v4_local_v6_v4mapped_local.v4_v6 not ok 16 bind_wildcard.v4_local_v6_v4mapped_local.v6_v4 # FAILED: 15 / 16 tests passed. On net.git: $ ./bind_wildcard ... not ok 14 bind_wildcard.v4_local_v6_v4mapped_any.v6_v4 not ok 16 bind_wildcard.v4_local_v6_v4mapped_local.v6_v4 # FAILED: 13 / 16 tests passed. With this series: $ ./bind_wildcard ... # PASSED: 16 / 16 tests passed. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- tools/testing/selftests/net/bind_wildcard.c | 46 +++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 81f6945360998..a2662348cdb1a 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -6,6 +6,24 @@ #include "../kselftest_harness.h" +struct in6_addr in6addr_v4mapped_any = { + .s6_addr = { + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 255, 255, + 0, 0, 0, 0 + } +}; + +struct in6_addr in6addr_v4mapped_loopback = { + .s6_addr = { + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 255, 255, + 127, 0, 0, 1 + } +}; + FIXTURE(bind_wildcard) { struct sockaddr_in addr4; @@ -33,6 +51,20 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) .expected_errno = 0, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) +{ + .addr4_const = INADDR_ANY, + .addr6_const = &in6addr_v4mapped_any, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) +{ + .addr4_const = INADDR_ANY, + .addr6_const = &in6addr_v4mapped_loopback, + .expected_errno = EADDRINUSE, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) { .addr4_const = INADDR_LOOPBACK, @@ -47,6 +79,20 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) .expected_errno = 0, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) +{ + .addr4_const = INADDR_LOOPBACK, + .addr6_const = &in6addr_v4mapped_any, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) +{ + .addr4_const = INADDR_LOOPBACK, + .addr6_const = &in6addr_v4mapped_loopback, + .expected_errno = EADDRINUSE, +}; + FIXTURE_SETUP(bind_wildcard) { self->addr4.sin_family = AF_INET; From e2ad626f8f409899baf1bf192d0533a851128b19 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 13 Sep 2023 00:11:27 +0200 Subject: [PATCH 147/161] pmdomain: Rename the genpd subsystem to pmdomain It has been pointed out that naming a subsystem "genpd" isn't very self-explanatory and the acronym itself that means Generic PM Domain, is known only by a limited group of people. In a way to improve the situation, let's rename the subsystem to pmdomain, which ideally should indicate that this is about so called Power Domains or "PM domains" as we often also use within the Linux Kernel terminology. Suggested-by: Rafael J. Wysocki Signed-off-by: Ulf Hansson Reviewed-by: Linus Walleij Acked-by: Arnd Bergmann Acked-by: Heiko Stuebner Acked-by: Rafael J. Wysocki Acked-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20230912221127.487327-1-ulf.hansson@linaro.org --- MAINTAINERS | 22 +++++++++---------- drivers/Makefile | 2 +- drivers/{genpd => pmdomain}/Makefile | 0 drivers/{genpd => pmdomain}/actions/Makefile | 0 .../actions/owl-sps-helper.c | 0 drivers/{genpd => pmdomain}/actions/owl-sps.c | 0 drivers/{genpd => pmdomain}/amlogic/Makefile | 0 .../amlogic/meson-ee-pwrc.c | 0 .../amlogic/meson-gx-pwrc-vpu.c | 0 .../amlogic/meson-secure-pwrc.c | 0 drivers/{genpd => pmdomain}/apple/Makefile | 0 .../{genpd => pmdomain}/apple/pmgr-pwrstate.c | 0 drivers/{genpd => pmdomain}/bcm/Makefile | 0 drivers/{genpd => pmdomain}/bcm/bcm-pmb.c | 0 .../{genpd => pmdomain}/bcm/bcm2835-power.c | 0 .../{genpd => pmdomain}/bcm/bcm63xx-power.c | 0 .../bcm/raspberrypi-power.c | 0 drivers/{genpd => pmdomain}/imx/Makefile | 0 drivers/{genpd => pmdomain}/imx/gpc.c | 0 drivers/{genpd => pmdomain}/imx/gpcv2.c | 0 .../{genpd => pmdomain}/imx/imx8m-blk-ctrl.c | 0 .../{genpd => pmdomain}/imx/imx8mp-blk-ctrl.c | 0 .../{genpd => pmdomain}/imx/imx93-blk-ctrl.c | 0 drivers/{genpd => pmdomain}/imx/imx93-pd.c | 0 drivers/{genpd => pmdomain}/imx/scu-pd.c | 0 drivers/{genpd => pmdomain}/mediatek/Makefile | 0 .../mediatek/mt6795-pm-domains.h | 0 .../mediatek/mt8167-pm-domains.h | 0 .../mediatek/mt8173-pm-domains.h | 0 .../mediatek/mt8183-pm-domains.h | 0 .../mediatek/mt8186-pm-domains.h | 0 .../mediatek/mt8188-pm-domains.h | 0 .../mediatek/mt8192-pm-domains.h | 0 .../mediatek/mt8195-pm-domains.h | 0 .../mediatek/mtk-pm-domains.c | 0 .../mediatek/mtk-pm-domains.h | 0 .../{genpd => pmdomain}/mediatek/mtk-scpsys.c | 0 drivers/{genpd => pmdomain}/qcom/Makefile | 0 drivers/{genpd => pmdomain}/qcom/cpr.c | 0 drivers/{genpd => pmdomain}/qcom/rpmhpd.c | 0 drivers/{genpd => pmdomain}/qcom/rpmpd.c | 0 drivers/{genpd => pmdomain}/renesas/Makefile | 0 .../renesas/r8a7742-sysc.c | 0 .../renesas/r8a7743-sysc.c | 0 .../renesas/r8a7745-sysc.c | 0 .../renesas/r8a77470-sysc.c | 0 .../renesas/r8a774a1-sysc.c | 0 .../renesas/r8a774b1-sysc.c | 0 .../renesas/r8a774c0-sysc.c | 0 .../renesas/r8a774e1-sysc.c | 0 .../renesas/r8a7779-sysc.c | 0 .../renesas/r8a7790-sysc.c | 0 .../renesas/r8a7791-sysc.c | 0 .../renesas/r8a7792-sysc.c | 0 .../renesas/r8a7794-sysc.c | 0 .../renesas/r8a7795-sysc.c | 0 .../renesas/r8a7796-sysc.c | 0 .../renesas/r8a77965-sysc.c | 0 .../renesas/r8a77970-sysc.c | 0 .../renesas/r8a77980-sysc.c | 0 .../renesas/r8a77990-sysc.c | 0 .../renesas/r8a77995-sysc.c | 0 .../renesas/r8a779a0-sysc.c | 0 .../renesas/r8a779f0-sysc.c | 0 .../renesas/r8a779g0-sysc.c | 0 .../renesas/rcar-gen4-sysc.c | 0 .../renesas/rcar-gen4-sysc.h | 0 .../{genpd => pmdomain}/renesas/rcar-sysc.c | 0 .../{genpd => pmdomain}/renesas/rcar-sysc.h | 0 .../renesas/rmobile-sysc.c | 0 drivers/{genpd => pmdomain}/rockchip/Makefile | 0 .../{genpd => pmdomain}/rockchip/pm-domains.c | 0 drivers/{genpd => pmdomain}/samsung/Makefile | 0 .../samsung/exynos-pm-domains.c | 0 drivers/{genpd => pmdomain}/st/Makefile | 0 .../st/ste-ux500-pm-domain.c | 0 drivers/{genpd => pmdomain}/starfive/Makefile | 0 .../{genpd => pmdomain}/starfive/jh71xx-pmu.c | 0 drivers/{genpd => pmdomain}/sunxi/Makefile | 0 .../{genpd => pmdomain}/sunxi/sun20i-ppu.c | 0 drivers/{genpd => pmdomain}/tegra/Makefile | 0 .../tegra/powergate-bpmp.c | 0 drivers/{genpd => pmdomain}/ti/Makefile | 0 drivers/{genpd => pmdomain}/ti/omap_prm.c | 0 .../ti/ti_sci_pm_domains.c | 0 drivers/{genpd => pmdomain}/xilinx/Makefile | 0 .../xilinx/zynqmp-pm-domains.c | 0 87 files changed, 12 insertions(+), 12 deletions(-) rename drivers/{genpd => pmdomain}/Makefile (100%) rename drivers/{genpd => pmdomain}/actions/Makefile (100%) rename drivers/{genpd => pmdomain}/actions/owl-sps-helper.c (100%) rename drivers/{genpd => pmdomain}/actions/owl-sps.c (100%) rename drivers/{genpd => pmdomain}/amlogic/Makefile (100%) rename drivers/{genpd => pmdomain}/amlogic/meson-ee-pwrc.c (100%) rename drivers/{genpd => pmdomain}/amlogic/meson-gx-pwrc-vpu.c (100%) rename drivers/{genpd => pmdomain}/amlogic/meson-secure-pwrc.c (100%) rename drivers/{genpd => pmdomain}/apple/Makefile (100%) rename drivers/{genpd => pmdomain}/apple/pmgr-pwrstate.c (100%) rename drivers/{genpd => pmdomain}/bcm/Makefile (100%) rename drivers/{genpd => pmdomain}/bcm/bcm-pmb.c (100%) rename drivers/{genpd => pmdomain}/bcm/bcm2835-power.c (100%) rename drivers/{genpd => pmdomain}/bcm/bcm63xx-power.c (100%) rename drivers/{genpd => pmdomain}/bcm/raspberrypi-power.c (100%) rename drivers/{genpd => pmdomain}/imx/Makefile (100%) rename drivers/{genpd => pmdomain}/imx/gpc.c (100%) rename drivers/{genpd => pmdomain}/imx/gpcv2.c (100%) rename drivers/{genpd => pmdomain}/imx/imx8m-blk-ctrl.c (100%) rename drivers/{genpd => pmdomain}/imx/imx8mp-blk-ctrl.c (100%) rename drivers/{genpd => pmdomain}/imx/imx93-blk-ctrl.c (100%) rename drivers/{genpd => pmdomain}/imx/imx93-pd.c (100%) rename drivers/{genpd => pmdomain}/imx/scu-pd.c (100%) rename drivers/{genpd => pmdomain}/mediatek/Makefile (100%) rename drivers/{genpd => pmdomain}/mediatek/mt6795-pm-domains.h (100%) rename drivers/{genpd => pmdomain}/mediatek/mt8167-pm-domains.h (100%) rename drivers/{genpd => pmdomain}/mediatek/mt8173-pm-domains.h (100%) rename drivers/{genpd => pmdomain}/mediatek/mt8183-pm-domains.h (100%) rename drivers/{genpd => pmdomain}/mediatek/mt8186-pm-domains.h (100%) rename drivers/{genpd => pmdomain}/mediatek/mt8188-pm-domains.h (100%) rename drivers/{genpd => pmdomain}/mediatek/mt8192-pm-domains.h (100%) rename drivers/{genpd => pmdomain}/mediatek/mt8195-pm-domains.h (100%) rename drivers/{genpd => pmdomain}/mediatek/mtk-pm-domains.c (100%) rename drivers/{genpd => pmdomain}/mediatek/mtk-pm-domains.h (100%) rename drivers/{genpd => pmdomain}/mediatek/mtk-scpsys.c (100%) rename drivers/{genpd => pmdomain}/qcom/Makefile (100%) rename drivers/{genpd => pmdomain}/qcom/cpr.c (100%) rename drivers/{genpd => pmdomain}/qcom/rpmhpd.c (100%) rename drivers/{genpd => pmdomain}/qcom/rpmpd.c (100%) rename drivers/{genpd => pmdomain}/renesas/Makefile (100%) rename drivers/{genpd => pmdomain}/renesas/r8a7742-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a7743-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a7745-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a77470-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a774a1-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a774b1-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a774c0-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a774e1-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a7779-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a7790-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a7791-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a7792-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a7794-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a7795-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a7796-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a77965-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a77970-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a77980-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a77990-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a77995-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a779a0-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a779f0-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/r8a779g0-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/rcar-gen4-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/rcar-gen4-sysc.h (100%) rename drivers/{genpd => pmdomain}/renesas/rcar-sysc.c (100%) rename drivers/{genpd => pmdomain}/renesas/rcar-sysc.h (100%) rename drivers/{genpd => pmdomain}/renesas/rmobile-sysc.c (100%) rename drivers/{genpd => pmdomain}/rockchip/Makefile (100%) rename drivers/{genpd => pmdomain}/rockchip/pm-domains.c (100%) rename drivers/{genpd => pmdomain}/samsung/Makefile (100%) rename drivers/{genpd => pmdomain}/samsung/exynos-pm-domains.c (100%) rename drivers/{genpd => pmdomain}/st/Makefile (100%) rename drivers/{genpd => pmdomain}/st/ste-ux500-pm-domain.c (100%) rename drivers/{genpd => pmdomain}/starfive/Makefile (100%) rename drivers/{genpd => pmdomain}/starfive/jh71xx-pmu.c (100%) rename drivers/{genpd => pmdomain}/sunxi/Makefile (100%) rename drivers/{genpd => pmdomain}/sunxi/sun20i-ppu.c (100%) rename drivers/{genpd => pmdomain}/tegra/Makefile (100%) rename drivers/{genpd => pmdomain}/tegra/powergate-bpmp.c (100%) rename drivers/{genpd => pmdomain}/ti/Makefile (100%) rename drivers/{genpd => pmdomain}/ti/omap_prm.c (100%) rename drivers/{genpd => pmdomain}/ti/ti_sci_pm_domains.c (100%) rename drivers/{genpd => pmdomain}/xilinx/Makefile (100%) rename drivers/{genpd => pmdomain}/xilinx/zynqmp-pm-domains.c (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 90f13281d2970..4d9e7d42412f0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1855,7 +1855,7 @@ F: Documentation/devicetree/bindings/phy/amlogic* F: arch/arm/boot/dts/amlogic/ F: arch/arm/mach-meson/ F: arch/arm64/boot/dts/amlogic/ -F: drivers/genpd/amlogic/ +F: drivers/pmdomain/amlogic/ F: drivers/mmc/host/meson* F: drivers/phy/amlogic/ F: drivers/pinctrl/meson/ @@ -1918,7 +1918,7 @@ F: drivers/bluetooth/hci_bcm4377.c F: drivers/clk/clk-apple-nco.c F: drivers/cpufreq/apple-soc-cpufreq.c F: drivers/dma/apple-admac.c -F: drivers/genpd/apple/ +F: drivers/pmdomain/apple/ F: drivers/i2c/busses/i2c-pasemi-core.c F: drivers/i2c/busses/i2c-pasemi-platform.c F: drivers/iommu/apple-dart.c @@ -2435,7 +2435,7 @@ F: arch/arm/mach-ux500/ F: drivers/clk/clk-nomadik.c F: drivers/clocksource/clksrc-dbx500-prcmu.c F: drivers/dma/ste_dma40* -F: drivers/genpd/st/ste-ux500-pm-domain.c +F: drivers/pmdomain/st/ste-ux500-pm-domain.c F: drivers/hwspinlock/u8500_hsem.c F: drivers/i2c/busses/i2c-nomadik.c F: drivers/iio/adc/ab8500-gpadc.c @@ -2598,7 +2598,7 @@ F: arch/arm/include/debug/renesas-scif.S F: arch/arm/mach-shmobile/ F: arch/arm64/boot/dts/renesas/ F: arch/riscv/boot/dts/renesas/ -F: drivers/genpd/renesas/ +F: drivers/pmdomain/renesas/ F: drivers/soc/renesas/ F: include/linux/soc/renesas/ K: \brenesas, @@ -4026,7 +4026,7 @@ F: arch/mips/kernel/*bmips* F: drivers/irqchip/irq-bcm63* F: drivers/irqchip/irq-bcm7* F: drivers/irqchip/irq-brcmstb* -F: drivers/genpd/bcm/bcm63xx-power.c +F: drivers/pmdomain/bcm/bcm63xx-power.c F: include/linux/bcm963xx_nvram.h F: include/linux/bcm963xx_tag.h @@ -4248,7 +4248,7 @@ R: Broadcom internal kernel review list L: linux-pm@vger.kernel.org S: Maintained T: git https://github.com/broadcom/stblinux.git -F: drivers/genpd/bcm/bcm-pmb.c +F: drivers/pmdomain/bcm/bcm-pmb.c F: include/dt-bindings/soc/bcm-pmb.h BROADCOM SPECIFIC AMBA DRIVER (BCMA) @@ -8729,7 +8729,7 @@ M: Ulf Hansson L: linux-pm@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git -F: drivers/genpd/ +F: drivers/pmdomain/ GENERIC RESISTIVE TOUCHSCREEN ADC DRIVER M: Eugen Hristev @@ -17680,7 +17680,7 @@ L: linux-pm@vger.kernel.org L: linux-arm-msm@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/power/avs/qcom,cpr.yaml -F: drivers/genpd/qcom/cpr.c +F: drivers/pmdomain/qcom/cpr.c QUALCOMM CPUFREQ DRIVER MSM8996/APQ8096 M: Ilia Lin @@ -20514,7 +20514,7 @@ STARFIVE JH71XX PMU CONTROLLER DRIVER M: Walker Chen S: Supported F: Documentation/devicetree/bindings/power/starfive* -F: drivers/genpd/starfive/jh71xx-pmu.c +F: drivers/pmdomain/starfive/jh71xx-pmu.c F: include/dt-bindings/power/starfive,jh7110-pmu.h STARFIVE SOC DRIVERS @@ -21339,7 +21339,7 @@ F: drivers/irqchip/irq-ti-sci-inta.c F: drivers/irqchip/irq-ti-sci-intr.c F: drivers/reset/reset-ti-sci.c F: drivers/soc/ti/ti_sci_inta_msi.c -F: drivers/genpd/ti/ti_sci_pm_domains.c +F: drivers/pmdomain/ti/ti_sci_pm_domains.c F: include/dt-bindings/soc/ti,sci_pm_domain.h F: include/linux/soc/ti/ti_sci_inta_msi.h F: include/linux/soc/ti/ti_sci_protocol.h @@ -21581,7 +21581,7 @@ L: linux-kernel@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux.git -F: drivers/genpd/ti/omap_prm.c +F: drivers/pmdomain/ti/omap_prm.c F: drivers/soc/ti/* TI LM49xxx FAMILY ASoC CODEC DRIVERS diff --git a/drivers/Makefile b/drivers/Makefile index cb0afca2e4a04..1bec7819a837a 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -46,7 +46,7 @@ obj-$(CONFIG_DMADEVICES) += dma/ # SOC specific infrastructure drivers. obj-y += soc/ -obj-$(CONFIG_PM_GENERIC_DOMAINS) += genpd/ +obj-$(CONFIG_PM_GENERIC_DOMAINS) += pmdomain/ obj-y += virtio/ obj-$(CONFIG_VDPA) += vdpa/ diff --git a/drivers/genpd/Makefile b/drivers/pmdomain/Makefile similarity index 100% rename from drivers/genpd/Makefile rename to drivers/pmdomain/Makefile diff --git a/drivers/genpd/actions/Makefile b/drivers/pmdomain/actions/Makefile similarity index 100% rename from drivers/genpd/actions/Makefile rename to drivers/pmdomain/actions/Makefile diff --git a/drivers/genpd/actions/owl-sps-helper.c b/drivers/pmdomain/actions/owl-sps-helper.c similarity index 100% rename from drivers/genpd/actions/owl-sps-helper.c rename to drivers/pmdomain/actions/owl-sps-helper.c diff --git a/drivers/genpd/actions/owl-sps.c b/drivers/pmdomain/actions/owl-sps.c similarity index 100% rename from drivers/genpd/actions/owl-sps.c rename to drivers/pmdomain/actions/owl-sps.c diff --git a/drivers/genpd/amlogic/Makefile b/drivers/pmdomain/amlogic/Makefile similarity index 100% rename from drivers/genpd/amlogic/Makefile rename to drivers/pmdomain/amlogic/Makefile diff --git a/drivers/genpd/amlogic/meson-ee-pwrc.c b/drivers/pmdomain/amlogic/meson-ee-pwrc.c similarity index 100% rename from drivers/genpd/amlogic/meson-ee-pwrc.c rename to drivers/pmdomain/amlogic/meson-ee-pwrc.c diff --git a/drivers/genpd/amlogic/meson-gx-pwrc-vpu.c b/drivers/pmdomain/amlogic/meson-gx-pwrc-vpu.c similarity index 100% rename from drivers/genpd/amlogic/meson-gx-pwrc-vpu.c rename to drivers/pmdomain/amlogic/meson-gx-pwrc-vpu.c diff --git a/drivers/genpd/amlogic/meson-secure-pwrc.c b/drivers/pmdomain/amlogic/meson-secure-pwrc.c similarity index 100% rename from drivers/genpd/amlogic/meson-secure-pwrc.c rename to drivers/pmdomain/amlogic/meson-secure-pwrc.c diff --git a/drivers/genpd/apple/Makefile b/drivers/pmdomain/apple/Makefile similarity index 100% rename from drivers/genpd/apple/Makefile rename to drivers/pmdomain/apple/Makefile diff --git a/drivers/genpd/apple/pmgr-pwrstate.c b/drivers/pmdomain/apple/pmgr-pwrstate.c similarity index 100% rename from drivers/genpd/apple/pmgr-pwrstate.c rename to drivers/pmdomain/apple/pmgr-pwrstate.c diff --git a/drivers/genpd/bcm/Makefile b/drivers/pmdomain/bcm/Makefile similarity index 100% rename from drivers/genpd/bcm/Makefile rename to drivers/pmdomain/bcm/Makefile diff --git a/drivers/genpd/bcm/bcm-pmb.c b/drivers/pmdomain/bcm/bcm-pmb.c similarity index 100% rename from drivers/genpd/bcm/bcm-pmb.c rename to drivers/pmdomain/bcm/bcm-pmb.c diff --git a/drivers/genpd/bcm/bcm2835-power.c b/drivers/pmdomain/bcm/bcm2835-power.c similarity index 100% rename from drivers/genpd/bcm/bcm2835-power.c rename to drivers/pmdomain/bcm/bcm2835-power.c diff --git a/drivers/genpd/bcm/bcm63xx-power.c b/drivers/pmdomain/bcm/bcm63xx-power.c similarity index 100% rename from drivers/genpd/bcm/bcm63xx-power.c rename to drivers/pmdomain/bcm/bcm63xx-power.c diff --git a/drivers/genpd/bcm/raspberrypi-power.c b/drivers/pmdomain/bcm/raspberrypi-power.c similarity index 100% rename from drivers/genpd/bcm/raspberrypi-power.c rename to drivers/pmdomain/bcm/raspberrypi-power.c diff --git a/drivers/genpd/imx/Makefile b/drivers/pmdomain/imx/Makefile similarity index 100% rename from drivers/genpd/imx/Makefile rename to drivers/pmdomain/imx/Makefile diff --git a/drivers/genpd/imx/gpc.c b/drivers/pmdomain/imx/gpc.c similarity index 100% rename from drivers/genpd/imx/gpc.c rename to drivers/pmdomain/imx/gpc.c diff --git a/drivers/genpd/imx/gpcv2.c b/drivers/pmdomain/imx/gpcv2.c similarity index 100% rename from drivers/genpd/imx/gpcv2.c rename to drivers/pmdomain/imx/gpcv2.c diff --git a/drivers/genpd/imx/imx8m-blk-ctrl.c b/drivers/pmdomain/imx/imx8m-blk-ctrl.c similarity index 100% rename from drivers/genpd/imx/imx8m-blk-ctrl.c rename to drivers/pmdomain/imx/imx8m-blk-ctrl.c diff --git a/drivers/genpd/imx/imx8mp-blk-ctrl.c b/drivers/pmdomain/imx/imx8mp-blk-ctrl.c similarity index 100% rename from drivers/genpd/imx/imx8mp-blk-ctrl.c rename to drivers/pmdomain/imx/imx8mp-blk-ctrl.c diff --git a/drivers/genpd/imx/imx93-blk-ctrl.c b/drivers/pmdomain/imx/imx93-blk-ctrl.c similarity index 100% rename from drivers/genpd/imx/imx93-blk-ctrl.c rename to drivers/pmdomain/imx/imx93-blk-ctrl.c diff --git a/drivers/genpd/imx/imx93-pd.c b/drivers/pmdomain/imx/imx93-pd.c similarity index 100% rename from drivers/genpd/imx/imx93-pd.c rename to drivers/pmdomain/imx/imx93-pd.c diff --git a/drivers/genpd/imx/scu-pd.c b/drivers/pmdomain/imx/scu-pd.c similarity index 100% rename from drivers/genpd/imx/scu-pd.c rename to drivers/pmdomain/imx/scu-pd.c diff --git a/drivers/genpd/mediatek/Makefile b/drivers/pmdomain/mediatek/Makefile similarity index 100% rename from drivers/genpd/mediatek/Makefile rename to drivers/pmdomain/mediatek/Makefile diff --git a/drivers/genpd/mediatek/mt6795-pm-domains.h b/drivers/pmdomain/mediatek/mt6795-pm-domains.h similarity index 100% rename from drivers/genpd/mediatek/mt6795-pm-domains.h rename to drivers/pmdomain/mediatek/mt6795-pm-domains.h diff --git a/drivers/genpd/mediatek/mt8167-pm-domains.h b/drivers/pmdomain/mediatek/mt8167-pm-domains.h similarity index 100% rename from drivers/genpd/mediatek/mt8167-pm-domains.h rename to drivers/pmdomain/mediatek/mt8167-pm-domains.h diff --git a/drivers/genpd/mediatek/mt8173-pm-domains.h b/drivers/pmdomain/mediatek/mt8173-pm-domains.h similarity index 100% rename from drivers/genpd/mediatek/mt8173-pm-domains.h rename to drivers/pmdomain/mediatek/mt8173-pm-domains.h diff --git a/drivers/genpd/mediatek/mt8183-pm-domains.h b/drivers/pmdomain/mediatek/mt8183-pm-domains.h similarity index 100% rename from drivers/genpd/mediatek/mt8183-pm-domains.h rename to drivers/pmdomain/mediatek/mt8183-pm-domains.h diff --git a/drivers/genpd/mediatek/mt8186-pm-domains.h b/drivers/pmdomain/mediatek/mt8186-pm-domains.h similarity index 100% rename from drivers/genpd/mediatek/mt8186-pm-domains.h rename to drivers/pmdomain/mediatek/mt8186-pm-domains.h diff --git a/drivers/genpd/mediatek/mt8188-pm-domains.h b/drivers/pmdomain/mediatek/mt8188-pm-domains.h similarity index 100% rename from drivers/genpd/mediatek/mt8188-pm-domains.h rename to drivers/pmdomain/mediatek/mt8188-pm-domains.h diff --git a/drivers/genpd/mediatek/mt8192-pm-domains.h b/drivers/pmdomain/mediatek/mt8192-pm-domains.h similarity index 100% rename from drivers/genpd/mediatek/mt8192-pm-domains.h rename to drivers/pmdomain/mediatek/mt8192-pm-domains.h diff --git a/drivers/genpd/mediatek/mt8195-pm-domains.h b/drivers/pmdomain/mediatek/mt8195-pm-domains.h similarity index 100% rename from drivers/genpd/mediatek/mt8195-pm-domains.h rename to drivers/pmdomain/mediatek/mt8195-pm-domains.h diff --git a/drivers/genpd/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c similarity index 100% rename from drivers/genpd/mediatek/mtk-pm-domains.c rename to drivers/pmdomain/mediatek/mtk-pm-domains.c diff --git a/drivers/genpd/mediatek/mtk-pm-domains.h b/drivers/pmdomain/mediatek/mtk-pm-domains.h similarity index 100% rename from drivers/genpd/mediatek/mtk-pm-domains.h rename to drivers/pmdomain/mediatek/mtk-pm-domains.h diff --git a/drivers/genpd/mediatek/mtk-scpsys.c b/drivers/pmdomain/mediatek/mtk-scpsys.c similarity index 100% rename from drivers/genpd/mediatek/mtk-scpsys.c rename to drivers/pmdomain/mediatek/mtk-scpsys.c diff --git a/drivers/genpd/qcom/Makefile b/drivers/pmdomain/qcom/Makefile similarity index 100% rename from drivers/genpd/qcom/Makefile rename to drivers/pmdomain/qcom/Makefile diff --git a/drivers/genpd/qcom/cpr.c b/drivers/pmdomain/qcom/cpr.c similarity index 100% rename from drivers/genpd/qcom/cpr.c rename to drivers/pmdomain/qcom/cpr.c diff --git a/drivers/genpd/qcom/rpmhpd.c b/drivers/pmdomain/qcom/rpmhpd.c similarity index 100% rename from drivers/genpd/qcom/rpmhpd.c rename to drivers/pmdomain/qcom/rpmhpd.c diff --git a/drivers/genpd/qcom/rpmpd.c b/drivers/pmdomain/qcom/rpmpd.c similarity index 100% rename from drivers/genpd/qcom/rpmpd.c rename to drivers/pmdomain/qcom/rpmpd.c diff --git a/drivers/genpd/renesas/Makefile b/drivers/pmdomain/renesas/Makefile similarity index 100% rename from drivers/genpd/renesas/Makefile rename to drivers/pmdomain/renesas/Makefile diff --git a/drivers/genpd/renesas/r8a7742-sysc.c b/drivers/pmdomain/renesas/r8a7742-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a7742-sysc.c rename to drivers/pmdomain/renesas/r8a7742-sysc.c diff --git a/drivers/genpd/renesas/r8a7743-sysc.c b/drivers/pmdomain/renesas/r8a7743-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a7743-sysc.c rename to drivers/pmdomain/renesas/r8a7743-sysc.c diff --git a/drivers/genpd/renesas/r8a7745-sysc.c b/drivers/pmdomain/renesas/r8a7745-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a7745-sysc.c rename to drivers/pmdomain/renesas/r8a7745-sysc.c diff --git a/drivers/genpd/renesas/r8a77470-sysc.c b/drivers/pmdomain/renesas/r8a77470-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a77470-sysc.c rename to drivers/pmdomain/renesas/r8a77470-sysc.c diff --git a/drivers/genpd/renesas/r8a774a1-sysc.c b/drivers/pmdomain/renesas/r8a774a1-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a774a1-sysc.c rename to drivers/pmdomain/renesas/r8a774a1-sysc.c diff --git a/drivers/genpd/renesas/r8a774b1-sysc.c b/drivers/pmdomain/renesas/r8a774b1-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a774b1-sysc.c rename to drivers/pmdomain/renesas/r8a774b1-sysc.c diff --git a/drivers/genpd/renesas/r8a774c0-sysc.c b/drivers/pmdomain/renesas/r8a774c0-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a774c0-sysc.c rename to drivers/pmdomain/renesas/r8a774c0-sysc.c diff --git a/drivers/genpd/renesas/r8a774e1-sysc.c b/drivers/pmdomain/renesas/r8a774e1-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a774e1-sysc.c rename to drivers/pmdomain/renesas/r8a774e1-sysc.c diff --git a/drivers/genpd/renesas/r8a7779-sysc.c b/drivers/pmdomain/renesas/r8a7779-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a7779-sysc.c rename to drivers/pmdomain/renesas/r8a7779-sysc.c diff --git a/drivers/genpd/renesas/r8a7790-sysc.c b/drivers/pmdomain/renesas/r8a7790-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a7790-sysc.c rename to drivers/pmdomain/renesas/r8a7790-sysc.c diff --git a/drivers/genpd/renesas/r8a7791-sysc.c b/drivers/pmdomain/renesas/r8a7791-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a7791-sysc.c rename to drivers/pmdomain/renesas/r8a7791-sysc.c diff --git a/drivers/genpd/renesas/r8a7792-sysc.c b/drivers/pmdomain/renesas/r8a7792-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a7792-sysc.c rename to drivers/pmdomain/renesas/r8a7792-sysc.c diff --git a/drivers/genpd/renesas/r8a7794-sysc.c b/drivers/pmdomain/renesas/r8a7794-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a7794-sysc.c rename to drivers/pmdomain/renesas/r8a7794-sysc.c diff --git a/drivers/genpd/renesas/r8a7795-sysc.c b/drivers/pmdomain/renesas/r8a7795-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a7795-sysc.c rename to drivers/pmdomain/renesas/r8a7795-sysc.c diff --git a/drivers/genpd/renesas/r8a7796-sysc.c b/drivers/pmdomain/renesas/r8a7796-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a7796-sysc.c rename to drivers/pmdomain/renesas/r8a7796-sysc.c diff --git a/drivers/genpd/renesas/r8a77965-sysc.c b/drivers/pmdomain/renesas/r8a77965-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a77965-sysc.c rename to drivers/pmdomain/renesas/r8a77965-sysc.c diff --git a/drivers/genpd/renesas/r8a77970-sysc.c b/drivers/pmdomain/renesas/r8a77970-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a77970-sysc.c rename to drivers/pmdomain/renesas/r8a77970-sysc.c diff --git a/drivers/genpd/renesas/r8a77980-sysc.c b/drivers/pmdomain/renesas/r8a77980-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a77980-sysc.c rename to drivers/pmdomain/renesas/r8a77980-sysc.c diff --git a/drivers/genpd/renesas/r8a77990-sysc.c b/drivers/pmdomain/renesas/r8a77990-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a77990-sysc.c rename to drivers/pmdomain/renesas/r8a77990-sysc.c diff --git a/drivers/genpd/renesas/r8a77995-sysc.c b/drivers/pmdomain/renesas/r8a77995-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a77995-sysc.c rename to drivers/pmdomain/renesas/r8a77995-sysc.c diff --git a/drivers/genpd/renesas/r8a779a0-sysc.c b/drivers/pmdomain/renesas/r8a779a0-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a779a0-sysc.c rename to drivers/pmdomain/renesas/r8a779a0-sysc.c diff --git a/drivers/genpd/renesas/r8a779f0-sysc.c b/drivers/pmdomain/renesas/r8a779f0-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a779f0-sysc.c rename to drivers/pmdomain/renesas/r8a779f0-sysc.c diff --git a/drivers/genpd/renesas/r8a779g0-sysc.c b/drivers/pmdomain/renesas/r8a779g0-sysc.c similarity index 100% rename from drivers/genpd/renesas/r8a779g0-sysc.c rename to drivers/pmdomain/renesas/r8a779g0-sysc.c diff --git a/drivers/genpd/renesas/rcar-gen4-sysc.c b/drivers/pmdomain/renesas/rcar-gen4-sysc.c similarity index 100% rename from drivers/genpd/renesas/rcar-gen4-sysc.c rename to drivers/pmdomain/renesas/rcar-gen4-sysc.c diff --git a/drivers/genpd/renesas/rcar-gen4-sysc.h b/drivers/pmdomain/renesas/rcar-gen4-sysc.h similarity index 100% rename from drivers/genpd/renesas/rcar-gen4-sysc.h rename to drivers/pmdomain/renesas/rcar-gen4-sysc.h diff --git a/drivers/genpd/renesas/rcar-sysc.c b/drivers/pmdomain/renesas/rcar-sysc.c similarity index 100% rename from drivers/genpd/renesas/rcar-sysc.c rename to drivers/pmdomain/renesas/rcar-sysc.c diff --git a/drivers/genpd/renesas/rcar-sysc.h b/drivers/pmdomain/renesas/rcar-sysc.h similarity index 100% rename from drivers/genpd/renesas/rcar-sysc.h rename to drivers/pmdomain/renesas/rcar-sysc.h diff --git a/drivers/genpd/renesas/rmobile-sysc.c b/drivers/pmdomain/renesas/rmobile-sysc.c similarity index 100% rename from drivers/genpd/renesas/rmobile-sysc.c rename to drivers/pmdomain/renesas/rmobile-sysc.c diff --git a/drivers/genpd/rockchip/Makefile b/drivers/pmdomain/rockchip/Makefile similarity index 100% rename from drivers/genpd/rockchip/Makefile rename to drivers/pmdomain/rockchip/Makefile diff --git a/drivers/genpd/rockchip/pm-domains.c b/drivers/pmdomain/rockchip/pm-domains.c similarity index 100% rename from drivers/genpd/rockchip/pm-domains.c rename to drivers/pmdomain/rockchip/pm-domains.c diff --git a/drivers/genpd/samsung/Makefile b/drivers/pmdomain/samsung/Makefile similarity index 100% rename from drivers/genpd/samsung/Makefile rename to drivers/pmdomain/samsung/Makefile diff --git a/drivers/genpd/samsung/exynos-pm-domains.c b/drivers/pmdomain/samsung/exynos-pm-domains.c similarity index 100% rename from drivers/genpd/samsung/exynos-pm-domains.c rename to drivers/pmdomain/samsung/exynos-pm-domains.c diff --git a/drivers/genpd/st/Makefile b/drivers/pmdomain/st/Makefile similarity index 100% rename from drivers/genpd/st/Makefile rename to drivers/pmdomain/st/Makefile diff --git a/drivers/genpd/st/ste-ux500-pm-domain.c b/drivers/pmdomain/st/ste-ux500-pm-domain.c similarity index 100% rename from drivers/genpd/st/ste-ux500-pm-domain.c rename to drivers/pmdomain/st/ste-ux500-pm-domain.c diff --git a/drivers/genpd/starfive/Makefile b/drivers/pmdomain/starfive/Makefile similarity index 100% rename from drivers/genpd/starfive/Makefile rename to drivers/pmdomain/starfive/Makefile diff --git a/drivers/genpd/starfive/jh71xx-pmu.c b/drivers/pmdomain/starfive/jh71xx-pmu.c similarity index 100% rename from drivers/genpd/starfive/jh71xx-pmu.c rename to drivers/pmdomain/starfive/jh71xx-pmu.c diff --git a/drivers/genpd/sunxi/Makefile b/drivers/pmdomain/sunxi/Makefile similarity index 100% rename from drivers/genpd/sunxi/Makefile rename to drivers/pmdomain/sunxi/Makefile diff --git a/drivers/genpd/sunxi/sun20i-ppu.c b/drivers/pmdomain/sunxi/sun20i-ppu.c similarity index 100% rename from drivers/genpd/sunxi/sun20i-ppu.c rename to drivers/pmdomain/sunxi/sun20i-ppu.c diff --git a/drivers/genpd/tegra/Makefile b/drivers/pmdomain/tegra/Makefile similarity index 100% rename from drivers/genpd/tegra/Makefile rename to drivers/pmdomain/tegra/Makefile diff --git a/drivers/genpd/tegra/powergate-bpmp.c b/drivers/pmdomain/tegra/powergate-bpmp.c similarity index 100% rename from drivers/genpd/tegra/powergate-bpmp.c rename to drivers/pmdomain/tegra/powergate-bpmp.c diff --git a/drivers/genpd/ti/Makefile b/drivers/pmdomain/ti/Makefile similarity index 100% rename from drivers/genpd/ti/Makefile rename to drivers/pmdomain/ti/Makefile diff --git a/drivers/genpd/ti/omap_prm.c b/drivers/pmdomain/ti/omap_prm.c similarity index 100% rename from drivers/genpd/ti/omap_prm.c rename to drivers/pmdomain/ti/omap_prm.c diff --git a/drivers/genpd/ti/ti_sci_pm_domains.c b/drivers/pmdomain/ti/ti_sci_pm_domains.c similarity index 100% rename from drivers/genpd/ti/ti_sci_pm_domains.c rename to drivers/pmdomain/ti/ti_sci_pm_domains.c diff --git a/drivers/genpd/xilinx/Makefile b/drivers/pmdomain/xilinx/Makefile similarity index 100% rename from drivers/genpd/xilinx/Makefile rename to drivers/pmdomain/xilinx/Makefile diff --git a/drivers/genpd/xilinx/zynqmp-pm-domains.c b/drivers/pmdomain/xilinx/zynqmp-pm-domains.c similarity index 100% rename from drivers/genpd/xilinx/zynqmp-pm-domains.c rename to drivers/pmdomain/xilinx/zynqmp-pm-domains.c From 3c44191dd76cf9c0cc49adaf34384cbd42ef8ad2 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 11 Sep 2023 13:28:14 -0700 Subject: [PATCH 148/161] ixgbe: fix timestamp configuration code The commit in fixes introduced flags to control the status of hardware configuration while processing packets. At the same time another structure is used to provide configuration of timestamper to user-space applications. The way it was coded makes this structures go out of sync easily. The repro is easy for 82599 chips: [root@hostname ~]# hwstamp_ctl -i eth0 -r 12 -t 1 current settings: tx_type 0 rx_filter 0 new settings: tx_type 1 rx_filter 12 The eth0 device is properly configured to timestamp any PTPv2 events. [root@hostname ~]# hwstamp_ctl -i eth0 -r 1 -t 1 current settings: tx_type 1 rx_filter 12 SIOCSHWTSTAMP failed: Numerical result out of range The requested time stamping mode is not supported by the hardware. The error is properly returned because HW doesn't support all packets timestamping. But the adapter->flags is cleared of timestamp flags even though no HW configuration was done. From that point no RX timestamps are received by user-space application. But configuration shows good values: [root@hostname ~]# hwstamp_ctl -i eth0 current settings: tx_type 1 rx_filter 12 Fix the issue by applying new flags only when the HW was actually configured. Fixes: a9763f3cb54c ("ixgbe: Update PTP to support X550EM_x devices") Signed-off-by: Vadim Fedorenko Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 28 +++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c index 0310af851086b..9339edbd90821 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c @@ -979,6 +979,7 @@ static int ixgbe_ptp_set_timestamp_mode(struct ixgbe_adapter *adapter, u32 tsync_tx_ctl = IXGBE_TSYNCTXCTL_ENABLED; u32 tsync_rx_ctl = IXGBE_TSYNCRXCTL_ENABLED; u32 tsync_rx_mtrl = PTP_EV_PORT << 16; + u32 aflags = adapter->flags; bool is_l2 = false; u32 regval; @@ -996,20 +997,20 @@ static int ixgbe_ptp_set_timestamp_mode(struct ixgbe_adapter *adapter, case HWTSTAMP_FILTER_NONE: tsync_rx_ctl = 0; tsync_rx_mtrl = 0; - adapter->flags &= ~(IXGBE_FLAG_RX_HWTSTAMP_ENABLED | - IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER); + aflags &= ~(IXGBE_FLAG_RX_HWTSTAMP_ENABLED | + IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER); break; case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: tsync_rx_ctl |= IXGBE_TSYNCRXCTL_TYPE_L4_V1; tsync_rx_mtrl |= IXGBE_RXMTRL_V1_SYNC_MSG; - adapter->flags |= (IXGBE_FLAG_RX_HWTSTAMP_ENABLED | - IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER); + aflags |= (IXGBE_FLAG_RX_HWTSTAMP_ENABLED | + IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER); break; case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: tsync_rx_ctl |= IXGBE_TSYNCRXCTL_TYPE_L4_V1; tsync_rx_mtrl |= IXGBE_RXMTRL_V1_DELAY_REQ_MSG; - adapter->flags |= (IXGBE_FLAG_RX_HWTSTAMP_ENABLED | - IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER); + aflags |= (IXGBE_FLAG_RX_HWTSTAMP_ENABLED | + IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER); break; case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: @@ -1023,8 +1024,8 @@ static int ixgbe_ptp_set_timestamp_mode(struct ixgbe_adapter *adapter, tsync_rx_ctl |= IXGBE_TSYNCRXCTL_TYPE_EVENT_V2; is_l2 = true; config->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; - adapter->flags |= (IXGBE_FLAG_RX_HWTSTAMP_ENABLED | - IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER); + aflags |= (IXGBE_FLAG_RX_HWTSTAMP_ENABLED | + IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER); break; case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: case HWTSTAMP_FILTER_NTP_ALL: @@ -1035,7 +1036,7 @@ static int ixgbe_ptp_set_timestamp_mode(struct ixgbe_adapter *adapter, if (hw->mac.type >= ixgbe_mac_X550) { tsync_rx_ctl |= IXGBE_TSYNCRXCTL_TYPE_ALL; config->rx_filter = HWTSTAMP_FILTER_ALL; - adapter->flags |= IXGBE_FLAG_RX_HWTSTAMP_ENABLED; + aflags |= IXGBE_FLAG_RX_HWTSTAMP_ENABLED; break; } fallthrough; @@ -1046,8 +1047,6 @@ static int ixgbe_ptp_set_timestamp_mode(struct ixgbe_adapter *adapter, * Delay_Req messages and hardware does not support * timestamping all packets => return error */ - adapter->flags &= ~(IXGBE_FLAG_RX_HWTSTAMP_ENABLED | - IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER); config->rx_filter = HWTSTAMP_FILTER_NONE; return -ERANGE; } @@ -1079,8 +1078,8 @@ static int ixgbe_ptp_set_timestamp_mode(struct ixgbe_adapter *adapter, IXGBE_TSYNCRXCTL_TYPE_ALL | IXGBE_TSYNCRXCTL_TSIP_UT_EN; config->rx_filter = HWTSTAMP_FILTER_ALL; - adapter->flags |= IXGBE_FLAG_RX_HWTSTAMP_ENABLED; - adapter->flags &= ~IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER; + aflags |= IXGBE_FLAG_RX_HWTSTAMP_ENABLED; + aflags &= ~IXGBE_FLAG_RX_HWTSTAMP_IN_REGISTER; is_l2 = true; break; default: @@ -1113,6 +1112,9 @@ static int ixgbe_ptp_set_timestamp_mode(struct ixgbe_adapter *adapter, IXGBE_WRITE_FLUSH(hw); + /* configure adapter flags only when HW is actually configured */ + adapter->flags = aflags; + /* clear TX/RX time stamp registers, just to be sure */ ixgbe_ptp_clear_tx_timestamp(adapter); IXGBE_READ_REG(hw, IXGBE_RXSTMPH); From bc6ed2fa24b14e40e1005488bbe11268ce7108fa Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Mon, 11 Sep 2023 13:28:49 -0700 Subject: [PATCH 149/161] igb: clean up in all error paths when enabling SR-IOV After commit 50f303496d92 ("igb: Enable SR-IOV after reinit"), removing the igb module could hang or crash (depending on the machine) when the module has been loaded with the max_vfs parameter set to some value != 0. In case of one test machine with a dual port 82580, this hang occurred: [ 232.480687] igb 0000:41:00.1: removed PHC on enp65s0f1 [ 233.093257] igb 0000:41:00.1: IOV Disabled [ 233.329969] pcieport 0000:40:01.0: AER: Multiple Uncorrected (Non-Fatal) err0 [ 233.340302] igb 0000:41:00.0: PCIe Bus Error: severity=Uncorrected (Non-Fata) [ 233.352248] igb 0000:41:00.0: device [8086:1516] error status/mask=00100000 [ 233.361088] igb 0000:41:00.0: [20] UnsupReq (First) [ 233.368183] igb 0000:41:00.0: AER: TLP Header: 40000001 0000040f cdbfc00c c [ 233.376846] igb 0000:41:00.1: PCIe Bus Error: severity=Uncorrected (Non-Fata) [ 233.388779] igb 0000:41:00.1: device [8086:1516] error status/mask=00100000 [ 233.397629] igb 0000:41:00.1: [20] UnsupReq (First) [ 233.404736] igb 0000:41:00.1: AER: TLP Header: 40000001 0000040f cdbfc00c c [ 233.538214] pci 0000:41:00.1: AER: can't recover (no error_detected callback) [ 233.538401] igb 0000:41:00.0: removed PHC on enp65s0f0 [ 233.546197] pcieport 0000:40:01.0: AER: device recovery failed [ 234.157244] igb 0000:41:00.0: IOV Disabled [ 371.619705] INFO: task irq/35-aerdrv:257 blocked for more than 122 seconds. [ 371.627489] Not tainted 6.4.0-dirty #2 [ 371.632257] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this. [ 371.641000] task:irq/35-aerdrv state:D stack:0 pid:257 ppid:2 f0 [ 371.650330] Call Trace: [ 371.653061] [ 371.655407] __schedule+0x20e/0x660 [ 371.659313] schedule+0x5a/0xd0 [ 371.662824] schedule_preempt_disabled+0x11/0x20 [ 371.667983] __mutex_lock.constprop.0+0x372/0x6c0 [ 371.673237] ? __pfx_aer_root_reset+0x10/0x10 [ 371.678105] report_error_detected+0x25/0x1c0 [ 371.682974] ? __pfx_report_normal_detected+0x10/0x10 [ 371.688618] pci_walk_bus+0x72/0x90 [ 371.692519] pcie_do_recovery+0xb2/0x330 [ 371.696899] aer_process_err_devices+0x117/0x170 [ 371.702055] aer_isr+0x1c0/0x1e0 [ 371.705661] ? __set_cpus_allowed_ptr+0x54/0xa0 [ 371.710723] ? __pfx_irq_thread_fn+0x10/0x10 [ 371.715496] irq_thread_fn+0x20/0x60 [ 371.719491] irq_thread+0xe6/0x1b0 [ 371.723291] ? __pfx_irq_thread_dtor+0x10/0x10 [ 371.728255] ? __pfx_irq_thread+0x10/0x10 [ 371.732731] kthread+0xe2/0x110 [ 371.736243] ? __pfx_kthread+0x10/0x10 [ 371.740430] ret_from_fork+0x2c/0x50 [ 371.744428] The reproducer was a simple script: #!/bin/sh for i in `seq 1 5`; do modprobe -rv igb modprobe -v igb max_vfs=1 sleep 1 modprobe -rv igb done It turned out that this could only be reproduce on 82580 (quad and dual-port), but not on 82576, i350 and i210. Further debugging showed that igb_enable_sriov()'s call to pci_enable_sriov() is failing, because dev->is_physfn is 0 on 82580. Prior to commit 50f303496d92 ("igb: Enable SR-IOV after reinit"), igb_enable_sriov() jumped into the "err_out" cleanup branch. After this commit it only returned the error code. So the cleanup didn't take place, and the incorrect VF setup in the igb_adapter structure fooled the igb driver into assuming that VFs have been set up where no VF actually existed. Fix this problem by cleaning up again if pci_enable_sriov() fails. Fixes: 50f303496d92 ("igb: Enable SR-IOV after reinit") Signed-off-by: Corinna Vinschen Reviewed-by: Akihiko Odaki Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 13ba9c74bd848..76b34cee1da3c 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3827,8 +3827,11 @@ static int igb_enable_sriov(struct pci_dev *pdev, int num_vfs, bool reinit) } /* only call pci_enable_sriov() if no VFs are allocated already */ - if (!old_vfs) + if (!old_vfs) { err = pci_enable_sriov(pdev, adapter->vfs_allocated_count); + if (err) + goto err_out; + } goto out; From e7b1ef29420fe52c2c1a273a9b4b36103a522625 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Tue, 12 Sep 2023 10:49:35 +0900 Subject: [PATCH 150/161] net: renesas: rswitch: Fix unmasking irq condition Fix unmasking irq condition by using napi_complete_done(). Otherwise, redundant interrupts happen. Fixes: 3590918b5d07 ("net: ethernet: renesas: Add support for "Ethernet Switch"") Signed-off-by: Yoshihiro Shimoda Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/rswitch.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 6083b1c8e4fb9..26c8807d7deaf 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -816,10 +816,10 @@ static int rswitch_poll(struct napi_struct *napi, int budget) netif_wake_subqueue(ndev, 0); - napi_complete(napi); - - rswitch_enadis_data_irq(priv, rdev->tx_queue->index, true); - rswitch_enadis_data_irq(priv, rdev->rx_queue->index, true); + if (napi_complete_done(napi, budget - quota)) { + rswitch_enadis_data_irq(priv, rdev->tx_queue->index, true); + rswitch_enadis_data_irq(priv, rdev->rx_queue->index, true); + } out: return budget - quota; From c4f922e86c8e0f7c5fe94e0547e9835fc9711f08 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Tue, 12 Sep 2023 10:49:36 +0900 Subject: [PATCH 151/161] net: renesas: rswitch: Add spin lock protection for irq {un}mask Add spin lock protection for irq {un}mask registers' control. After napi_complete_done() and this protection were applied, a lot of redundant interrupts no longer occur. For example: when "iperf3 -c -R" on R-Car S4-8 Spider Before the patches are applied: about 800,000 times happened After the patches were applied: about 100,000 times happened Fixes: 3590918b5d07 ("net: ethernet: renesas: Add support for "Ethernet Switch"") Signed-off-by: Yoshihiro Shimoda Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/rswitch.c | 12 ++++++++++++ drivers/net/ethernet/renesas/rswitch.h | 2 ++ 2 files changed, 14 insertions(+) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 26c8807d7deaf..ea9186178091d 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -799,6 +799,7 @@ static int rswitch_poll(struct napi_struct *napi, int budget) struct net_device *ndev = napi->dev; struct rswitch_private *priv; struct rswitch_device *rdev; + unsigned long flags; int quota = budget; rdev = netdev_priv(ndev); @@ -817,8 +818,10 @@ static int rswitch_poll(struct napi_struct *napi, int budget) netif_wake_subqueue(ndev, 0); if (napi_complete_done(napi, budget - quota)) { + spin_lock_irqsave(&priv->lock, flags); rswitch_enadis_data_irq(priv, rdev->tx_queue->index, true); rswitch_enadis_data_irq(priv, rdev->rx_queue->index, true); + spin_unlock_irqrestore(&priv->lock, flags); } out: @@ -835,8 +838,10 @@ static void rswitch_queue_interrupt(struct net_device *ndev) struct rswitch_device *rdev = netdev_priv(ndev); if (napi_schedule_prep(&rdev->napi)) { + spin_lock(&rdev->priv->lock); rswitch_enadis_data_irq(rdev->priv, rdev->tx_queue->index, false); rswitch_enadis_data_irq(rdev->priv, rdev->rx_queue->index, false); + spin_unlock(&rdev->priv->lock); __napi_schedule(&rdev->napi); } } @@ -1440,14 +1445,17 @@ static void rswitch_ether_port_deinit_all(struct rswitch_private *priv) static int rswitch_open(struct net_device *ndev) { struct rswitch_device *rdev = netdev_priv(ndev); + unsigned long flags; phy_start(ndev->phydev); napi_enable(&rdev->napi); netif_start_queue(ndev); + spin_lock_irqsave(&rdev->priv->lock, flags); rswitch_enadis_data_irq(rdev->priv, rdev->tx_queue->index, true); rswitch_enadis_data_irq(rdev->priv, rdev->rx_queue->index, true); + spin_unlock_irqrestore(&rdev->priv->lock, flags); if (bitmap_empty(rdev->priv->opened_ports, RSWITCH_NUM_PORTS)) iowrite32(GWCA_TS_IRQ_BIT, rdev->priv->addr + GWTSDIE); @@ -1461,6 +1469,7 @@ static int rswitch_stop(struct net_device *ndev) { struct rswitch_device *rdev = netdev_priv(ndev); struct rswitch_gwca_ts_info *ts_info, *ts_info2; + unsigned long flags; netif_tx_stop_all_queues(ndev); bitmap_clear(rdev->priv->opened_ports, rdev->port, 1); @@ -1476,8 +1485,10 @@ static int rswitch_stop(struct net_device *ndev) kfree(ts_info); } + spin_lock_irqsave(&rdev->priv->lock, flags); rswitch_enadis_data_irq(rdev->priv, rdev->tx_queue->index, false); rswitch_enadis_data_irq(rdev->priv, rdev->rx_queue->index, false); + spin_unlock_irqrestore(&rdev->priv->lock, flags); phy_stop(ndev->phydev); napi_disable(&rdev->napi); @@ -1887,6 +1898,7 @@ static int renesas_eth_sw_probe(struct platform_device *pdev) priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; + spin_lock_init(&priv->lock); attr = soc_device_match(rswitch_soc_no_speed_change); if (attr) diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index 54f397effbc6b..f0c16a37ea558 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -1011,6 +1011,8 @@ struct rswitch_private { struct rswitch_etha etha[RSWITCH_NUM_PORTS]; struct rswitch_mfwd mfwd; + spinlock_t lock; /* lock interrupt registers' control */ + bool etha_no_runtime_change; bool gwca_halt; }; From a22730b1b4bf437c6bbfdeff5feddf54be4aeada Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 11 Sep 2023 19:27:53 -0700 Subject: [PATCH 152/161] kcm: Fix error handling for SOCK_DGRAM in kcm_sendmsg(). syzkaller found a memory leak in kcm_sendmsg(), and commit c821a88bd720 ("kcm: Fix memory leak in error path of kcm_sendmsg()") suppressed it by updating kcm_tx_msg(head)->last_skb if partial data is copied so that the following sendmsg() will resume from the skb. However, we cannot know how many bytes were copied when we get the error. Thus, we could mess up the MSG_MORE queue. When kcm_sendmsg() fails for SOCK_DGRAM, we should purge the queue as we do so for UDP by udp_flush_pending_frames(). Even without this change, when the error occurred, the following sendmsg() resumed from a wrong skb and the queue was messed up. However, we have yet to get such a report, and only syzkaller stumbled on it. So, this can be changed safely. Note this does not change SOCK_SEQPACKET behaviour. Fixes: c821a88bd720 ("kcm: Fix memory leak in error path of kcm_sendmsg()") Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230912022753.33327-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/kcm/kcmsock.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 740539a218b7c..dd1d8ffd5f594 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -930,17 +930,18 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) out_error: kcm_push(kcm); - if (copied && sock->type == SOCK_SEQPACKET) { + if (sock->type == SOCK_SEQPACKET) { /* Wrote some bytes before encountering an * error, return partial success. */ - goto partial_message; - } - - if (head != kcm->seq_skb) + if (copied) + goto partial_message; + if (head != kcm->seq_skb) + kfree_skb(head); + } else { kfree_skb(head); - else if (copied) - kcm_tx_msg(head)->last_skb = skb; + kcm->seq_skb = NULL; + } err = sk_stream_error(sk, msg->msg_flags, err); From 7908632f2927b65f7486ae6b67c24071666ba43f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Thu, 14 Sep 2023 07:19:02 -0300 Subject: [PATCH 153/161] Revert "drm/vkms: Fix race-condition between the hrtimer and the atomic commit" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit a0e6a017ab56936c0405fe914a793b241ed25ee0. Unlocking a mutex in the context of a hrtimer callback is violating mutex locking rules, as mutex_unlock() from interrupt context is not permitted. Link: https://lore.kernel.org/dri-devel/ZQLAc%2FFwkv%2FGiVoK@phenom.ffwll.local/T/#t Acked-by: Daniel Vetter Signed-off-by: Maíra Canal Signed-off-by: Maíra Canal Link: https://patchwork.freedesktop.org/patch/msgid/20230914102024.1789154-1-mcanal@igalia.com --- drivers/gpu/drm/vkms/vkms_composer.c | 9 ++------- drivers/gpu/drm/vkms/vkms_crtc.c | 9 ++++----- drivers/gpu/drm/vkms/vkms_drv.h | 4 +--- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/vkms/vkms_composer.c b/drivers/gpu/drm/vkms/vkms_composer.c index d5d4f642d3678..3c99fb8b54e2d 100644 --- a/drivers/gpu/drm/vkms/vkms_composer.c +++ b/drivers/gpu/drm/vkms/vkms_composer.c @@ -408,15 +408,10 @@ void vkms_set_composer(struct vkms_output *out, bool enabled) if (enabled) drm_crtc_vblank_get(&out->crtc); - mutex_lock(&out->enabled_lock); + spin_lock_irq(&out->lock); old_enabled = out->composer_enabled; out->composer_enabled = enabled; - - /* the composition wasn't enabled, so unlock the lock to make sure the lock - * will be balanced even if we have a failed commit - */ - if (!out->composer_enabled) - mutex_unlock(&out->enabled_lock); + spin_unlock_irq(&out->lock); if (old_enabled) drm_crtc_vblank_put(&out->crtc); diff --git a/drivers/gpu/drm/vkms/vkms_crtc.c b/drivers/gpu/drm/vkms/vkms_crtc.c index 3c5ebf106b66d..61e500b8c9da2 100644 --- a/drivers/gpu/drm/vkms/vkms_crtc.c +++ b/drivers/gpu/drm/vkms/vkms_crtc.c @@ -16,7 +16,7 @@ static enum hrtimer_restart vkms_vblank_simulate(struct hrtimer *timer) struct drm_crtc *crtc = &output->crtc; struct vkms_crtc_state *state; u64 ret_overrun; - bool ret, fence_cookie, composer_enabled; + bool ret, fence_cookie; fence_cookie = dma_fence_begin_signalling(); @@ -25,15 +25,15 @@ static enum hrtimer_restart vkms_vblank_simulate(struct hrtimer *timer) if (ret_overrun != 1) pr_warn("%s: vblank timer overrun\n", __func__); + spin_lock(&output->lock); ret = drm_crtc_handle_vblank(crtc); if (!ret) DRM_ERROR("vkms failure on handling vblank"); state = output->composer_state; - composer_enabled = output->composer_enabled; - mutex_unlock(&output->enabled_lock); + spin_unlock(&output->lock); - if (state && composer_enabled) { + if (state && output->composer_enabled) { u64 frame = drm_crtc_accurate_vblank_count(crtc); /* update frame_start only if a queued vkms_composer_worker() @@ -295,7 +295,6 @@ int vkms_crtc_init(struct drm_device *dev, struct drm_crtc *crtc, spin_lock_init(&vkms_out->lock); spin_lock_init(&vkms_out->composer_lock); - mutex_init(&vkms_out->enabled_lock); vkms_out->composer_workq = alloc_ordered_workqueue("vkms_composer", 0); if (!vkms_out->composer_workq) diff --git a/drivers/gpu/drm/vkms/vkms_drv.h b/drivers/gpu/drm/vkms/vkms_drv.h index c7ae6c2ba1df0..8f5710debb1eb 100644 --- a/drivers/gpu/drm/vkms/vkms_drv.h +++ b/drivers/gpu/drm/vkms/vkms_drv.h @@ -108,10 +108,8 @@ struct vkms_output { struct workqueue_struct *composer_workq; /* protects concurrent access to composer */ spinlock_t lock; - /* guarantees that if the composer is enabled, a job will be queued */ - struct mutex enabled_lock; - /* protected by @enabled_lock */ + /* protected by @lock */ bool composer_enabled; struct vkms_crtc_state *composer_state; From 139a27854bf5ce93ff9805f9f7683b88c13074dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Thu, 7 Sep 2023 15:53:38 +0200 Subject: [PATCH 154/161] drm/tests: helpers: Avoid a driver uaf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit when using __drm_kunit_helper_alloc_drm_device() the driver may be dereferenced by device-managed resources up until the device is freed, which is typically later than the kunit-managed resource code frees it. Fix this by simply make the driver device-managed as well. In short, the sequence leading to the UAF is as follows: INIT: Code allocates a struct device as a kunit-managed resource. Code allocates a drm driver as a kunit-managed resource. Code allocates a drm device as a device-managed resource. EXIT: Kunit resource cleanup frees the drm driver Kunit resource cleanup puts the struct device, which starts a device-managed resource cleanup device-managed cleanup calls drm_dev_put() drm_dev_put() dereferences the (now freed) drm driver -> Boom. Related KASAN message: [55272.551542] ================================================================== [55272.551551] BUG: KASAN: slab-use-after-free in drm_dev_put.part.0+0xd4/0xe0 [drm] [55272.551603] Read of size 8 at addr ffff888127502828 by task kunit_try_catch/10353 [55272.551612] CPU: 4 PID: 10353 Comm: kunit_try_catch Tainted: G U N 6.5.0-rc7+ #155 [55272.551620] Hardware name: ASUS System Product Name/PRIME B560M-A AC, BIOS 0403 01/26/2021 [55272.551626] Call Trace: [55272.551629] [55272.551633] dump_stack_lvl+0x57/0x90 [55272.551639] print_report+0xcf/0x630 [55272.551645] ? _raw_spin_lock_irqsave+0x5f/0x70 [55272.551652] ? drm_dev_put.part.0+0xd4/0xe0 [drm] [55272.551694] kasan_report+0xd7/0x110 [55272.551699] ? drm_dev_put.part.0+0xd4/0xe0 [drm] [55272.551742] drm_dev_put.part.0+0xd4/0xe0 [drm] [55272.551783] devres_release_all+0x15d/0x1f0 [55272.551790] ? __pfx_devres_release_all+0x10/0x10 [55272.551797] device_unbind_cleanup+0x16/0x1a0 [55272.551802] device_release_driver_internal+0x3e5/0x540 [55272.551808] ? kobject_put+0x5d/0x4b0 [55272.551814] bus_remove_device+0x1f1/0x3f0 [55272.551819] device_del+0x342/0x910 [55272.551826] ? __pfx_device_del+0x10/0x10 [55272.551830] ? lock_release+0x339/0x5e0 [55272.551836] ? kunit_remove_resource+0x128/0x290 [kunit] [55272.551845] ? __pfx_lock_release+0x10/0x10 [55272.551851] platform_device_del.part.0+0x1f/0x1e0 [55272.551856] ? _raw_spin_unlock_irqrestore+0x30/0x60 [55272.551863] kunit_remove_resource+0x195/0x290 [kunit] [55272.551871] ? _raw_spin_unlock_irqrestore+0x30/0x60 [55272.551877] kunit_cleanup+0x78/0x120 [kunit] [55272.551885] ? __kthread_parkme+0xc1/0x1f0 [55272.551891] ? __pfx_kunit_try_run_case_cleanup+0x10/0x10 [kunit] [55272.551900] ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10 [kunit] [55272.551909] kunit_generic_run_threadfn_adapter+0x4a/0x90 [kunit] [55272.551919] kthread+0x2e7/0x3c0 [55272.551924] ? __pfx_kthread+0x10/0x10 [55272.551929] ret_from_fork+0x2d/0x70 [55272.551935] ? __pfx_kthread+0x10/0x10 [55272.551940] ret_from_fork_asm+0x1b/0x30 [55272.551948] [55272.551953] Allocated by task 10351: [55272.551956] kasan_save_stack+0x1c/0x40 [55272.551962] kasan_set_track+0x21/0x30 [55272.551966] __kasan_kmalloc+0x8b/0x90 [55272.551970] __kmalloc+0x5e/0x160 [55272.551976] kunit_kmalloc_array+0x1c/0x50 [kunit] [55272.551984] drm_exec_test_init+0xfa/0x2c0 [drm_exec_test] [55272.551991] kunit_try_run_case+0xdd/0x250 [kunit] [55272.551999] kunit_generic_run_threadfn_adapter+0x4a/0x90 [kunit] [55272.552008] kthread+0x2e7/0x3c0 [55272.552012] ret_from_fork+0x2d/0x70 [55272.552017] ret_from_fork_asm+0x1b/0x30 [55272.552024] Freed by task 10353: [55272.552027] kasan_save_stack+0x1c/0x40 [55272.552032] kasan_set_track+0x21/0x30 [55272.552036] kasan_save_free_info+0x27/0x40 [55272.552041] __kasan_slab_free+0x106/0x180 [55272.552046] slab_free_freelist_hook+0xb3/0x160 [55272.552051] __kmem_cache_free+0xb2/0x290 [55272.552056] kunit_remove_resource+0x195/0x290 [kunit] [55272.552064] kunit_cleanup+0x78/0x120 [kunit] [55272.552072] kunit_generic_run_threadfn_adapter+0x4a/0x90 [kunit] [55272.552080] kthread+0x2e7/0x3c0 [55272.552085] ret_from_fork+0x2d/0x70 [55272.552089] ret_from_fork_asm+0x1b/0x30 [55272.552096] The buggy address belongs to the object at ffff888127502800 which belongs to the cache kmalloc-512 of size 512 [55272.552105] The buggy address is located 40 bytes inside of freed 512-byte region [ffff888127502800, ffff888127502a00) [55272.552115] The buggy address belongs to the physical page: [55272.552119] page:00000000af6c70ff refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x127500 [55272.552127] head:00000000af6c70ff order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [55272.552133] anon flags: 0x17ffffc0010200(slab|head|node=0|zone=2|lastcpupid=0x1fffff) [55272.552141] page_type: 0xffffffff() [55272.552145] raw: 0017ffffc0010200 ffff888100042c80 0000000000000000 dead000000000001 [55272.552152] raw: 0000000000000000 0000000080200020 00000001ffffffff 0000000000000000 [55272.552157] page dumped because: kasan: bad access detected [55272.552163] Memory state around the buggy address: [55272.552167] ffff888127502700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [55272.552173] ffff888127502780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [55272.552178] >ffff888127502800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [55272.552184] ^ [55272.552187] ffff888127502880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [55272.552193] ffff888127502900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [55272.552198] ================================================================== [55272.552203] Disabling lock debugging due to kernel taint v2: - Update commit message, add Fixes: tag and Cc stable. v3: - Further commit message updates (Maxime Ripard). Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Cc: stable@vger.kernel.org # v6.3+ Fixes: d98780310719 ("drm/tests: helpers: Allow to pass a custom drm_driver") Signed-off-by: Thomas Hellström Reviewed-by: Francois Dugast Acked-by: Maxime Ripard Link: https://lore.kernel.org/r/20230907135339.7971-2-thomas.hellstrom@linux.intel.com Signed-off-by: Maxime Ripard --- include/drm/drm_kunit_helpers.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/drm/drm_kunit_helpers.h b/include/drm/drm_kunit_helpers.h index 514c8a7a32f0f..ba483c87f0e7b 100644 --- a/include/drm/drm_kunit_helpers.h +++ b/include/drm/drm_kunit_helpers.h @@ -3,6 +3,8 @@ #ifndef DRM_KUNIT_HELPERS_H_ #define DRM_KUNIT_HELPERS_H_ +#include + #include struct drm_device; @@ -51,7 +53,7 @@ __drm_kunit_helper_alloc_drm_device(struct kunit *test, { struct drm_driver *driver; - driver = kunit_kzalloc(test, sizeof(*driver), GFP_KERNEL); + driver = devm_kzalloc(dev, sizeof(*driver), GFP_KERNEL); KUNIT_ASSERT_NOT_NULL(test, driver); driver->driver_features = features; From f6007dce0cd35d634d9be91ef3515a6385dcee16 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 9 Aug 2023 12:44:20 +0200 Subject: [PATCH 155/161] dm: fix a race condition in retrieve_deps There's a race condition in the multipath target when retrieve_deps races with multipath_message calling dm_get_device and dm_put_device. retrieve_deps walks the list of open devices without holding any lock but multipath may add or remove devices to the list while it is running. The end result may be memory corruption or use-after-free memory access. See this description of a UAF with multipath_message(): https://listman.redhat.com/archives/dm-devel/2022-October/052373.html Fix this bug by introducing a new rw semaphore "devices_lock". We grab devices_lock for read in retrieve_deps and we grab it for write in dm_get_device and dm_put_device. Reported-by: Luo Meng Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Tested-by: Li Lingfeng Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 1 + drivers/md/dm-ioctl.c | 7 ++++++- drivers/md/dm-table.c | 32 ++++++++++++++++++++++++-------- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 0d93661f88d30..095b9b49aa825 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -214,6 +214,7 @@ struct dm_table { /* a list of devices used by this table */ struct list_head devices; + struct rw_semaphore devices_lock; /* events get handed up using this callback */ void (*event_fn)(void *data); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index f5ed729a8e0cd..21ebb6c39394b 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1630,6 +1630,8 @@ static void retrieve_deps(struct dm_table *table, struct dm_dev_internal *dd; struct dm_target_deps *deps; + down_read(&table->devices_lock); + deps = get_result_buffer(param, param_size, &len); /* @@ -1644,7 +1646,7 @@ static void retrieve_deps(struct dm_table *table, needed = struct_size(deps, dev, count); if (len < needed) { param->flags |= DM_BUFFER_FULL_FLAG; - return; + goto out; } /* @@ -1656,6 +1658,9 @@ static void retrieve_deps(struct dm_table *table, deps->dev[count++] = huge_encode_dev(dd->dm_dev->bdev->bd_dev); param->data_size = param->data_start + needed; + +out: + up_read(&table->devices_lock); } static int table_deps(struct file *filp, struct dm_ioctl *param, size_t param_size) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7d208b2b1a192..37b48f63ae6a5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -135,6 +135,7 @@ int dm_table_create(struct dm_table **result, blk_mode_t mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); + init_rwsem(&t->devices_lock); if (!num_targets) num_targets = KEYS_PER_NODE; @@ -359,16 +360,20 @@ int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, if (dev == disk_devt(t->md->disk)) return -EINVAL; + down_write(&t->devices_lock); + dd = find_device(&t->devices, dev); if (!dd) { dd = kmalloc(sizeof(*dd), GFP_KERNEL); - if (!dd) - return -ENOMEM; + if (!dd) { + r = -ENOMEM; + goto unlock_ret_r; + } r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev); if (r) { kfree(dd); - return r; + goto unlock_ret_r; } refcount_set(&dd->count, 1); @@ -378,12 +383,17 @@ int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { r = upgrade_mode(dd, mode, t->md); if (r) - return r; + goto unlock_ret_r; } refcount_inc(&dd->count); out: + up_write(&t->devices_lock); *result = dd->dm_dev; return 0; + +unlock_ret_r: + up_write(&t->devices_lock); + return r; } EXPORT_SYMBOL(dm_get_device); @@ -419,9 +429,12 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, void dm_put_device(struct dm_target *ti, struct dm_dev *d) { int found = 0; - struct list_head *devices = &ti->table->devices; + struct dm_table *t = ti->table; + struct list_head *devices = &t->devices; struct dm_dev_internal *dd; + down_write(&t->devices_lock); + list_for_each_entry(dd, devices, list) { if (dd->dm_dev == d) { found = 1; @@ -430,14 +443,17 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d) } if (!found) { DMERR("%s: device %s not in table devices list", - dm_device_name(ti->table->md), d->name); - return; + dm_device_name(t->md), d->name); + goto unlock_ret; } if (refcount_dec_and_test(&dd->count)) { - dm_put_table_device(ti->table->md, d); + dm_put_table_device(t->md, d); list_del(&dd->list); kfree(dd); } + +unlock_ret: + up_write(&t->devices_lock); } EXPORT_SYMBOL(dm_put_device); From c21a8027ad8a68c340d0d58bf1cc61dcb0bc4d2f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 14 Sep 2023 16:51:09 +0100 Subject: [PATCH 156/161] io_uring/net: fix iter retargeting for selected buf When using selected buffer feature, io_uring delays data iter setup until later. If io_setup_async_msg() is called before that it might see not correctly setup iterator. Pre-init nr_segs and judge from its state whether we repointing. Cc: stable@vger.kernel.org Reported-by: syzbot+a4c6e5ef999b68b26ed1@syzkaller.appspotmail.com Fixes: 0455d4ccec548 ("io_uring: add POLL_FIRST support for send/sendmsg and recv/recvmsg") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0000000000002770be06053c7757@google.com Signed-off-by: Jens Axboe --- io_uring/net.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index 3d07bf79c1e02..7a8e298af81b3 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -183,6 +183,10 @@ static int io_setup_async_msg(struct io_kiocb *req, memcpy(async_msg, kmsg, sizeof(*kmsg)); if (async_msg->msg.msg_name) async_msg->msg.msg_name = &async_msg->addr; + + if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs) + return -EAGAIN; + /* if were using fast_iov, set it to the new one */ if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov; @@ -542,6 +546,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { iomsg->msg.msg_name = &iomsg->addr; + iomsg->msg.msg_iter.nr_segs = 0; #ifdef CONFIG_COMPAT if (req->ctx->compat) From c8870379a21fbd9ad14ca36204ccfbe9d25def43 Mon Sep 17 00:00:00 2001 From: Mariusz Tkaczyk Date: Thu, 14 Sep 2023 17:24:16 +0200 Subject: [PATCH 157/161] md: Put the right device in md_seq_next If there are multiple arrays in system and one mddevice is marked with MD_DELETED and md_seq_next() is called in the middle of removal then it _get()s proper device but it may _put() deleted one. As a result, active counter may never be zeroed for mddevice and it cannot be removed. Put the device which has been _get with previous md_seq_next() call. Cc: stable@vger.kernel.org Fixes: 12a6caf27324 ("md: only delete entries from all_mddevs when the disk is freed") Reported-by: AceLan Kao Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217798 Cc: Yu Kuai Signed-off-by: Mariusz Tkaczyk Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230914152416.10819-1-mariusz.tkaczyk@linux.intel.com --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 73758b7541275..a104a025084dc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8265,7 +8265,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) spin_unlock(&all_mddevs_lock); if (to_put) - mddev_put(mddev); + mddev_put(to_put); return next_mddev; } From fb2c10245f201278804a6f28e196e95436059d6d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 14 Sep 2023 21:42:20 +0200 Subject: [PATCH 158/161] thermal: core: Fix disabled trip point check in handle_thermal_trip() Commit bc840ea5f9a9 ("thermal: core: Do not handle trip points with invalid temperature") added a check for invalid temperature to the disabled trip point check in handle_thermal_trip(), but that check was added at a point when the trip structure has not been initialized yet. This may cause handle_thermal_trip() to skip a valid trip point in some cases, so fix it by moving the check to a suitable place, after __thermal_zone_get_trip() has been called to populate the trip structure. Fixes: bc840ea5f9a9 ("thermal: core: Do not handle trip points with invalid temperature") Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 8717a33435125..58533ea75cd92 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -348,12 +348,14 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip_id) struct thermal_trip trip; /* Ignore disabled trip points */ - if (test_bit(trip_id, &tz->trips_disabled) || - trip.temperature == THERMAL_TEMP_INVALID) + if (test_bit(trip_id, &tz->trips_disabled)) return; __thermal_zone_get_trip(tz, trip_id, &trip); + if (trip.temperature == THERMAL_TEMP_INVALID) + return; + if (tz->last_temperature != THERMAL_TEMP_INVALID) { if (tz->last_temperature < trip.temperature && tz->temperature >= trip.temperature) From 6cc834ba62998c65c42d0c63499bdd35067151ec Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 Sep 2023 14:38:58 -0700 Subject: [PATCH 159/161] nvme: avoid bogus CRTO values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices are reporting controller ready mode support, but return 0 for CRTO. These devices require a much higher time to ready than that, so they are failing to initialize after the driver starter preferring that value over CAP.TO. The spec requires that CAP.TO match the appropritate CRTO value, or be set to 0xff if CRTO is larger than that. This means that CAP.TO can be used to validate if CRTO is reliable, and provides an appropriate fallback for setting the timeout value if not. Use whichever is larger. Link: https://bugzilla.kernel.org/show_bug.cgi?id=217863 Reported-by: Cláudio Sampaio Reported-by: Felix Yan Tested-by: Felix Yan Based-on-a-patch-by: Felix Yan Cc: stable@vger.kernel.org Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 54 ++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 37b6fa7466620..0685ed4f2dc49 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2245,25 +2245,8 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) else ctrl->ctrl_config = NVME_CC_CSS_NVM; - if (ctrl->cap & NVME_CAP_CRMS_CRWMS) { - u32 crto; - - ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto); - if (ret) { - dev_err(ctrl->device, "Reading CRTO failed (%d)\n", - ret); - return ret; - } - - if (ctrl->cap & NVME_CAP_CRMS_CRIMS) { - ctrl->ctrl_config |= NVME_CC_CRIME; - timeout = NVME_CRTO_CRIMT(crto); - } else { - timeout = NVME_CRTO_CRWMT(crto); - } - } else { - timeout = NVME_CAP_TIMEOUT(ctrl->cap); - } + if (ctrl->cap & NVME_CAP_CRMS_CRWMS && ctrl->cap & NVME_CAP_CRMS_CRIMS) + ctrl->ctrl_config |= NVME_CC_CRIME; ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; @@ -2277,6 +2260,39 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) if (ret) return ret; + /* CAP value may change after initial CC write */ + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); + if (ret) + return ret; + + timeout = NVME_CAP_TIMEOUT(ctrl->cap); + if (ctrl->cap & NVME_CAP_CRMS_CRWMS) { + u32 crto, ready_timeout; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto); + if (ret) { + dev_err(ctrl->device, "Reading CRTO failed (%d)\n", + ret); + return ret; + } + + /* + * CRTO should always be greater or equal to CAP.TO, but some + * devices are known to get this wrong. Use the larger of the + * two values. + */ + if (ctrl->ctrl_config & NVME_CC_CRIME) + ready_timeout = NVME_CRTO_CRIMT(crto); + else + ready_timeout = NVME_CRTO_CRWMT(crto); + + if (ready_timeout < timeout) + dev_warn_once(ctrl->device, "bad crto:%x cap:%llx\n", + crto, ctrl->cap); + else + timeout = ready_timeout; + } + ctrl->ctrl_config |= NVME_CC_ENABLE; ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) From 3c70de9b580998e5d644f4e80a9944c30aa1197b Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 15 Sep 2023 18:33:59 +0900 Subject: [PATCH 160/161] Revert "firewire: core: obsolete usage of GFP_ATOMIC at building node tree" This reverts commit 06f45435d985d60d7d2fe2424fbb9909d177a63d. John Ogness reports the case that the allocation is in atomic context under acquired spin-lock. [ 12.555784] BUG: sleeping function called from invalid context at include/linux/sched/mm.h:306 [ 12.555808] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 70, name: kworker/1:2 [ 12.555814] preempt_count: 1, expected: 0 [ 12.555820] INFO: lockdep is turned off. [ 12.555824] irq event stamp: 208 [ 12.555828] hardirqs last enabled at (207): [] ._raw_spin_unlock_irq+0x44/0x80 [ 12.555850] hardirqs last disabled at (208): [] .__schedule+0x854/0xfe0 [ 12.555859] softirqs last enabled at (188): [] .addrconf_verify_rtnl+0x2c4/0xb70 [ 12.555872] softirqs last disabled at (182): [] .addrconf_verify_rtnl+0x70/0xb70 [ 12.555884] CPU: 1 PID: 70 Comm: kworker/1:2 Tainted: G S 6.6.0-rc1 #1 [ 12.555893] Hardware name: PowerMac7,2 PPC970 0x390202 PowerMac [ 12.555898] Workqueue: firewire_ohci .bus_reset_work [firewire_ohci] [ 12.555939] Call Trace: [ 12.555944] [c000000009677830] [c0000000010d83c0] .dump_stack_lvl+0x8c/0xd0 (unreliable) [ 12.555963] [c0000000096778b0] [c000000000140270] .__might_resched+0x320/0x340 [ 12.555978] [c000000009677940] [c000000000497600] .__kmem_cache_alloc_node+0x390/0x460 [ 12.555993] [c000000009677a10] [c0000000003fe620] .__kmalloc+0x70/0x310 [ 12.556007] [c000000009677ac0] [c0003d00004e2268] .fw_core_handle_bus_reset+0x2c8/0xba0 [firewire_core] [ 12.556060] [c000000009677c20] [c0003d0000491190] .bus_reset_work+0x330/0x9b0 [firewire_ohci] [ 12.556079] [c000000009677d10] [c00000000011d0d0] .process_one_work+0x280/0x6f0 [ 12.556094] [c000000009677e10] [c00000000011d8a0] .worker_thread+0x360/0x500 [ 12.556107] [c000000009677ef0] [c00000000012e3b4] .kthread+0x154/0x160 [ 12.556120] [c000000009677f90] [c00000000000bfa8] .start_kernel_thread+0x10/0x14 Cc: stable@kernel.org Reported-by: John Ogness Link: https://lore.kernel.org/lkml/87jzsuv1xk.fsf@jogness.linutronix.de/raw Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-device.c | 2 +- drivers/firewire/core-topology.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index a3104e35412c1..aa597cda0d887 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -1211,7 +1211,7 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event) * without actually having a link. */ create: - device = kzalloc(sizeof(*device), GFP_KERNEL); + device = kzalloc(sizeof(*device), GFP_ATOMIC); if (device == NULL) break; diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 88466b663482f..f40c815343812 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -101,7 +101,7 @@ static struct fw_node *fw_node_create(u32 sid, int port_count, int color) { struct fw_node *node; - node = kzalloc(struct_size(node, ports, port_count), GFP_KERNEL); + node = kzalloc(struct_size(node, ports, port_count), GFP_ATOMIC); if (node == NULL) return NULL; From a9ce385344f916cd1c36a33905e564f5581beae9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Sep 2023 13:14:23 -0600 Subject: [PATCH 161/161] dm: don't attempt to queue IO under RCU protection dm looks up the table for IO based on the request type, with an assumption that if the request is marked REQ_NOWAIT, it's fine to attempt to submit that IO while under RCU read lock protection. This is not OK, as REQ_NOWAIT just means that we should not be sleeping waiting on other IO, it does not mean that we can't potentially schedule. A simple test case demonstrates this quite nicely: int main(int argc, char *argv[]) { struct iovec iov; int fd; fd = open("/dev/dm-0", O_RDONLY | O_DIRECT); posix_memalign(&iov.iov_base, 4096, 4096); iov.iov_len = 4096; preadv2(fd, &iov, 1, 0, RWF_NOWAIT); return 0; } which will instantly spew: BUG: sleeping function called from invalid context at include/linux/sched/mm.h:306 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 5580, name: dm-nowait preempt_count: 0, expected: 0 RCU nest depth: 1, expected: 0 INFO: lockdep is turned off. CPU: 7 PID: 5580 Comm: dm-nowait Not tainted 6.6.0-rc1-g39956d2dcd81 #132 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 Call Trace: dump_stack_lvl+0x11d/0x1b0 __might_resched+0x3c3/0x5e0 ? preempt_count_sub+0x150/0x150 mempool_alloc+0x1e2/0x390 ? mempool_resize+0x7d0/0x7d0 ? lock_sync+0x190/0x190 ? lock_release+0x4b7/0x670 ? internal_get_user_pages_fast+0x868/0x2d40 bio_alloc_bioset+0x417/0x8c0 ? bvec_alloc+0x200/0x200 ? internal_get_user_pages_fast+0xb8c/0x2d40 bio_alloc_clone+0x53/0x100 dm_submit_bio+0x27f/0x1a20 ? lock_release+0x4b7/0x670 ? blk_try_enter_queue+0x1a0/0x4d0 ? dm_dax_direct_access+0x260/0x260 ? rcu_is_watching+0x12/0xb0 ? blk_try_enter_queue+0x1cc/0x4d0 __submit_bio+0x239/0x310 ? __bio_queue_enter+0x700/0x700 ? kvm_clock_get_cycles+0x40/0x60 ? ktime_get+0x285/0x470 submit_bio_noacct_nocheck+0x4d9/0xb80 ? should_fail_request+0x80/0x80 ? preempt_count_sub+0x150/0x150 ? lock_release+0x4b7/0x670 ? __bio_add_page+0x143/0x2d0 ? iov_iter_revert+0x27/0x360 submit_bio_noacct+0x53e/0x1b30 submit_bio_wait+0x10a/0x230 ? submit_bio_wait_endio+0x40/0x40 __blkdev_direct_IO_simple+0x4f8/0x780 ? blkdev_bio_end_io+0x4c0/0x4c0 ? stack_trace_save+0x90/0xc0 ? __bio_clone+0x3c0/0x3c0 ? lock_release+0x4b7/0x670 ? lock_sync+0x190/0x190 ? atime_needs_update+0x3bf/0x7e0 ? timestamp_truncate+0x21b/0x2d0 ? inode_owner_or_capable+0x240/0x240 blkdev_direct_IO.part.0+0x84a/0x1810 ? rcu_is_watching+0x12/0xb0 ? lock_release+0x4b7/0x670 ? blkdev_read_iter+0x40d/0x530 ? reacquire_held_locks+0x4e0/0x4e0 ? __blkdev_direct_IO_simple+0x780/0x780 ? rcu_is_watching+0x12/0xb0 ? __mark_inode_dirty+0x297/0xd50 ? preempt_count_add+0x72/0x140 blkdev_read_iter+0x2a4/0x530 do_iter_readv_writev+0x2f2/0x3c0 ? generic_copy_file_range+0x1d0/0x1d0 ? fsnotify_perm.part.0+0x25d/0x630 ? security_file_permission+0xd8/0x100 do_iter_read+0x31b/0x880 ? import_iovec+0x10b/0x140 vfs_readv+0x12d/0x1a0 ? vfs_iter_read+0xb0/0xb0 ? rcu_is_watching+0x12/0xb0 ? rcu_is_watching+0x12/0xb0 ? lock_release+0x4b7/0x670 do_preadv+0x1b3/0x260 ? do_readv+0x370/0x370 __x64_sys_preadv2+0xef/0x150 do_syscall_64+0x39/0xb0 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f5af41ad806 Code: 41 54 41 89 fc 55 44 89 c5 53 48 89 cb 48 83 ec 18 80 3d e4 dd 0d 00 00 74 7a 45 89 c1 49 89 ca 45 31 c0 b8 47 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 be 00 00 00 48 85 c0 79 4a 48 8b 0d da 55 RSP: 002b:00007ffd3145c7f0 EFLAGS: 00000246 ORIG_RAX: 0000000000000147 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f5af41ad806 RDX: 0000000000000001 RSI: 00007ffd3145c850 RDI: 0000000000000003 RBP: 0000000000000008 R08: 0000000000000000 R09: 0000000000000008 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003 R13: 00007ffd3145c850 R14: 000055f5f0431dd8 R15: 0000000000000001 where in fact it is dm itself that attempts to allocate a bio clone with GFP_NOIO under the rcu read lock, regardless of the request type. Fix this by getting rid of the special casing for REQ_NOWAIT, and just use the normal SRCU protected table lookup. Get rid of the bio based table locking helpers at the same time, as they are now unused. Cc: stable@vger.kernel.org Fixes: 563a225c9fd2 ("dm: introduce dm_{get,put}_live_table_bio called from dm_submit_bio") Signed-off-by: Jens Axboe Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f0f118ab20fa2..64a1f306c96c1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -715,24 +715,6 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) rcu_read_unlock(); } -static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md, - int *srcu_idx, blk_opf_t bio_opf) -{ - if (bio_opf & REQ_NOWAIT) - return dm_get_live_table_fast(md); - else - return dm_get_live_table(md, srcu_idx); -} - -static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx, - blk_opf_t bio_opf) -{ - if (bio_opf & REQ_NOWAIT) - dm_put_live_table_fast(md); - else - dm_put_live_table(md, srcu_idx); -} - static char *_dm_claim_ptr = "I belong to device-mapper"; /* @@ -1833,9 +1815,8 @@ static void dm_submit_bio(struct bio *bio) struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; int srcu_idx; struct dm_table *map; - blk_opf_t bio_opf = bio->bi_opf; - map = dm_get_live_table_bio(md, &srcu_idx, bio_opf); + map = dm_get_live_table(md, &srcu_idx); /* If suspended, or map not yet available, queue this IO for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || @@ -1851,7 +1832,7 @@ static void dm_submit_bio(struct bio *bio) dm_split_and_process_bio(md, map, bio); out: - dm_put_live_table_bio(md, srcu_idx, bio_opf); + dm_put_live_table(md, srcu_idx); } static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,